-
Notifications
You must be signed in to change notification settings - Fork 2
/
analyze-netflix-data.py
177 lines (149 loc) · 7.02 KB
/
analyze-netflix-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import argparse
import csv
import jinja2
import json
import math
import os
import re
# 0:01:05 -> 65
def durationTimeToSeconds(duration):
    """Convert an ``H:MM:SS`` duration string into a number of seconds.

    Args:
        duration: Text made of exactly three colon-separated integer
            fields (hours, minutes, seconds), e.g. ``"0:01:05"``.

    Returns:
        The total number of seconds as an int, or 0 when ``duration`` is
        not a string in the expected format.
    """
    try:
        hours, minutes, seconds = duration.split(':')
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the value is not a str (e.g. None).
        # ValueError: wrong number of fields or a non-numeric field.
        # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
        return 0
# 65 -> 00:01:05
def secondsToDurantion(seconds):
    """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Elapsed time in seconds. Fractional values are floored
            to whole seconds before formatting.

    Returns:
        A string such as ``"00:01:05"``. The hours field is not capped,
        so it grows past two digits for long durations.
    """
    # Floor first so that float input does not reach the integer-only
    # ``{:02d}`` format specifier.
    wholeSeconds = math.floor(seconds)
    hours, remainder = divmod(wholeSeconds, 3600)
    minutes, remainingSeconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, remainingSeconds)
def get_arguments():
    """Parse the script's command-line options.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, carrying
        an ``input`` attribute (viewing activity CSV location) and an
        ``output`` attribute (path for the output files).
    """
    # (flag, help text) pairs registered on the parser, in order.
    optionHelp = (
        ("-input", "Viewing activity CSV file location"),
        ("-output", "Path for the output files"),
    )
    cli = argparse.ArgumentParser()
    for flag, description in optionHelp:
        cli.add_argument(flag, help=description)
    return cli.parse_args()
def parseNetflixData(inputFileName):
    """Read a Netflix viewing-activity CSV export into a list of records.

    The file is expected to start with this header row:
    Profile Name,Start Time,Duration,Attributes,Title,
    Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country

    Rows with a non-empty "Supplemental Video Type" column are skipped.
    A title shaped like ``<series>: <Season|Part|...> <id>: <episode>``
    produces a series record; any other title produces a movie record.

    Args:
        inputFileName: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of dicts with the keys ``movie``, ``serie``, ``season``,
        ``episode``, ``date``, ``profile`` and ``duration`` (seconds, as
        returned by ``durationTimeToSeconds``). Unused text fields are
        empty strings.
    """
    # Compiled once, outside the row loop.
    serieTitle = re.compile(
        r'(.*): (Season|Part|Vol\.|Series|Chapter|Temporada|Parte|Universo|Capítulo) ([ a-zA-Záéíê\d]*( Remix)*): (.*)')
    netflixData = []
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(inputFileName, encoding="utf-8", newline="") as netflixCSV:
        reader = csv.reader(netflixCSV)
        # Skip the header row explicitly. Its "Supplemental Video Type"
        # cell is non-empty, so the filter below would discard it anyway;
        # removing the first collected record afterwards would therefore
        # drop a real viewing entry (or fail on an empty result).
        next(reader, None)
        for row in reader:
            # Blank or truncated lines cannot be indexed up to column 5.
            if len(row) < 6:
                continue
            supplementalVideoType = row[5]
            if supplementalVideoType:
                continue
            record = {'movie': '',
                      'serie': '',
                      'season': '',
                      'episode': '',
                      'date': row[1],
                      'profile': row[0],
                      'duration': durationTimeToSeconds(row[2])}
            title = row[4]
            serieMatching = serieTitle.search(title)
            if serieMatching:
                record['serie'] = serieMatching.group(1)
                record['season'] = 'Season {}'.format(serieMatching.group(3))
                record['episode'] = serieMatching.group(5)
            else:
                record['movie'] = title
            netflixData.append(record)
    return netflixData
def getMoviesAndSeriesObj(data):
    """Aggregate watched durations per profile for movies and series.

    Args:
        data: Iterable of dicts with the keys ``profile``, ``movie``,
            ``serie``, ``season``, ``episode`` and ``duration``.

    Returns:
        A three-element list ``[profiles, movies, series]`` where
        ``profiles`` is the set of profile names seen,
        ``movies`` maps profile -> movie title -> summed duration, and
        ``series`` maps profile -> series -> season -> episode -> summed
        duration. Every profile seen gets an entry in both mappings, even
        when it has no movies or no series.
    """
    seenProfiles = set()
    movieTotals = {}
    seriesTotals = {}
    for entry in data:
        owner = entry['profile']
        seenProfiles.add(owner)
        ownerMovies = movieTotals.setdefault(owner, {})
        ownerSeries = seriesTotals.setdefault(owner, {})

        movieTitle = entry['movie']
        if movieTitle:
            ownerMovies[movieTitle] = ownerMovies.get(movieTitle, 0) + entry['duration']

        seriesTitle = entry['serie']
        if seriesTitle:
            seasonEpisodes = ownerSeries.setdefault(seriesTitle, {}).setdefault(entry['season'], {})
            episodeName = entry['episode']
            seasonEpisodes[episodeName] = seasonEpisodes.get(episodeName, 0) + entry['duration']
    return [seenProfiles, movieTotals, seriesTotals]
def getOutputFilePath(outputDir, fileName):
    """Return the path of ``fileName`` inside ``outputDir``, creating the directory.

    Args:
        outputDir: Directory that should hold the output file. Missing
            directories, including intermediate ones, are created.
        fileName: Name of the file inside ``outputDir``.

    Returns:
        ``os.path.join(outputDir, fileName)``, or None when the directory
        does not exist and cannot be created.
    """
    try:
        # makedirs also handles nested paths such as "out/reports";
        # exist_ok makes an already existing directory a no-op.
        os.makedirs(outputDir, exist_ok=True)
    except OSError:
        return None
    return os.path.join(outputDir, fileName)
def generateHTMLPage(outputDir, profiles, moviesWatchedTimes, seriesWatchedTime, visualizationData):
    """Render the watched-time table and visualization data into ``index.html``.

    Args:
        outputDir: Directory passed to ``getOutputFilePath`` to locate
            ``index.html``.
        profiles: Accepted for signature compatibility; not read here.
        moviesWatchedTimes: Mapping profile -> movie title -> seconds.
        seriesWatchedTime: Mapping profile -> series -> season ->
            episode -> seconds.
        visualizationData: Object handed to the template as
            ``visualization_data``.

    The template is obtained through a ``jinja2.FileSystemLoader`` built
    with ``'netflix-data-template.html'`` and ``get_template('')``.
    NOTE(review): presumably that path is resolved against the current
    working directory - confirm.
    """
    # Datatable: one row per movie first, then one row per series.
    tableRows = [
        dict(profile=owner, title=name, type='Movie',
             total_seconds=watched, total_time=secondsToDurantion(watched))
        for owner, titles in moviesWatchedTimes.items()
        for name, watched in titles.items()
    ]
    for owner, shows in seriesWatchedTime.items():
        for name, seasons in shows.items():
            # A series total is the sum over every episode of every season.
            watched = sum(sum(episodes.values()) for episodes in seasons.values())
            tableRows.append(dict(profile=owner, title=name, type='Series',
                                  total_seconds=watched, total_time=secondsToDurantion(watched)))

    templateEnv = jinja2.Environment(
        loader=jinja2.FileSystemLoader('netflix-data-template.html'))
    targetPath = getOutputFilePath(outputDir, 'index.html')
    with open(targetPath, 'w', encoding="utf-8") as output:
        page = templateEnv.get_template('').render(
            watched_table=tableRows, visualization_data=visualizationData)
        output.write(page)
def generateJsonForVisualization(outputDir, profiles, moviesWatchedTimes, seriesWatchedTime):
    """Build the nested name/children/value tree used by the visualization.

    Args:
        outputDir: Accepted for signature compatibility; not read here.
        profiles: Iterable of profile names; each must be a key of both
            mappings below.
        moviesWatchedTimes: Mapping profile -> movie title -> seconds.
        seriesWatchedTime: Mapping profile -> series -> season ->
            episode -> seconds.

    Returns:
        A dict ``{"name": "Profiles", "children": [...]}`` with one child
        per profile. Each profile node has a "Movies" child (one leaf per
        movie) and a "Series" child (one node per series, holding one leaf
        per season whose value is the sum of its episodes).
    """
    profileNodes = []
    for owner in profiles:
        movieLeaves = [
            {"name": title, "value": watched}
            for title, watched in moviesWatchedTimes[owner].items()
        ]
        seriesNodes = []
        for title, seasons in seriesWatchedTime[owner].items():
            seasonLeaves = [
                {"name": '{}'.format(label), "value": sum(episodes.values())}
                for label, episodes in seasons.items()
            ]
            seriesNodes.append({"name": title, "children": seasonLeaves})
        profileNodes.append({
            "name": owner,
            "children": [
                {"name": "Movies", "children": movieLeaves},
                {"name": "Series", "children": seriesNodes},
            ],
        })
    return {"name": "Profiles", "children": profileNodes}
def main():
    """Run the pipeline: parse the CSV, aggregate, build the tree, write the page."""
    options = get_arguments()
    records = parseNetflixData(options.input)
    profiles, movies, series = getMoviesAndSeriesObj(records)
    # The visualization tree is built first and then embedded in the page.
    tree = generateJsonForVisualization(options.output, profiles, movies, series)
    generateHTMLPage(options.output, profiles, movies, series, tree)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()