-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot_number_of_contributors.py
304 lines (246 loc) · 12.6 KB
/
plot_number_of_contributors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
import argparse
import os
import logging
from collections import defaultdict
from datetime import datetime, timedelta
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from git import Repo
from matplotlib.ticker import MaxNLocator
# Function to read and parse the .mailmap file
def parse_mailmap(repo_path):
    """Read ``<repo_path>/.mailmap`` and map e-mail addresses to canonical names.

    Every non-comment line is split into ``Name <email>`` entries. The first
    non-empty name found on a line is taken as the canonical name, and every
    e-mail address on that line is mapped to it.

    Blank lines, lines starting with ``#`` and trailing text without an
    ``<email>`` part (for example a trailing comment) are ignored instead of
    raising. Lines that carry no name at all (``<a@x> <b@x>``) produce no
    mapping, so a lookup with a default can fall back to another name.

    Args:
        repo_path: Directory expected to contain the ``.mailmap`` file.

    Returns:
        dict: e-mail address -> canonical name. Empty when the file does not
        exist (a warning is logged in that case).
    """
    mailmap = {}
    mailmap_path = os.path.join(repo_path, '.mailmap')
    try:
        # Undecodable bytes are replaced so one odd line cannot abort parsing.
        with open(mailmap_path, 'r', encoding='utf-8', errors='replace') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                canonical_name = ''
                for part in line.split('>'):
                    # Pieces without '<' are trailing whitespace or comments.
                    if '<' not in part:
                        continue
                    name, _, email = part.partition('<')
                    name = name.strip()
                    email = email.strip()
                    if not canonical_name:
                        canonical_name = name
                    if canonical_name and email:
                        mailmap[email] = canonical_name
    except FileNotFoundError:
        logging.warning(".mailmap file not found, continuing without it")
    return mailmap
# Function to process contributors' continuous activity periods
def _make_period(start_date, last_date, granularity):
    """Build one ``(start, end, actual_length)`` tuple for a finished period."""
    actual_length = (last_date - start_date).days + 1
    # Short periods are stretched to at least `granularity` days.
    period_length = max(actual_length, granularity)
    return start_date, start_date + timedelta(days=period_length), actual_length


def process_activity_periods(contributors, window, granularity=7):
    """
    Group each contributor's commit dates into continuous activity periods.

    A continuous activity period is a run of commit dates in which the gap
    between two consecutive dates never exceeds ``window``. A larger gap
    closes the current period and starts a new one.

    Args:
        contributors (dict): Contributor name -> iterable of commit dates.
        window (timedelta): Maximum allowed gap between consecutive commit
            dates inside one period.
        granularity (int): Minimum period length in days. Shorter periods are
            extended to this length when computing the end date.

    Returns:
        defaultdict(list): Contributor name -> list of
        ``(start_date, end_date, actual_length)`` tuples, where

        * ``start_date`` is the first commit date of the period,
        * ``end_date`` is ``start_date`` plus ``max(actual_length,
          granularity)`` days,
        * ``actual_length`` is the number of days from the first to the last
          commit date, both included (not extended by ``granularity``).

        Contributors without any commit date get no entry.
    """
    activity_periods = defaultdict(list)
    for contributor, dates in contributors.items():
        sorted_dates = sorted(dates)
        if not sorted_dates:
            # Nothing to group; avoids indexing into an empty list.
            continue
        start_date = last_date = sorted_dates[0]
        for commit_date in sorted_dates[1:]:
            if commit_date - last_date > window:
                activity_periods[contributor].append(
                    _make_period(start_date, last_date, granularity)
                )
                start_date = commit_date
            last_date = commit_date
        # Close the period that was still open when the dates ran out.
        activity_periods[contributor].append(
            _make_period(start_date, last_date, granularity)
        )
    return activity_periods
# Function to count contributors and their activity dates with a minimum contribution filter
def count_contributors(repo_path, branch, mailmap, exclusions, delta, window, granularity):
    """Collect commit dates per contributor and turn them into activity periods.

    Args:
        repo_path: Path handed to ``Repo`` to open the repository.
        branch: Revision whose commits are iterated.
        mailmap (dict): Author e-mail -> canonical contributor name. Authors
            whose e-mail is not in the mapping are identified by the author
            name stored in the commit.
        exclusions: Strings; a contributor whose resolved name contains any
            of them is skipped.
        delta (timedelta): How far back from now to look. A falsy value
            (``None`` or a zero timedelta) means the whole history.
        window (timedelta): Forwarded to ``process_activity_periods``.
        granularity (int): Forwarded to ``process_activity_periods``.

    Returns:
        The result of ``process_activity_periods`` for the collected
        contributor -> set-of-commit-dates mapping.
    """
    repo = Repo(repo_path)
    contributors = defaultdict(set)
    if delta:
        since = datetime.now() - delta
        commits = list(repo.iter_commits(branch, since=since.strftime('%Y-%m-%d')))
    else:
        commits = list(repo.iter_commits(branch))
    total_commits = len(commits)
    logging.info(f"Analyzing {total_commits} commits...")

    # Report roughly every 10%. The step is clamped to 1 because
    # `total_commits // 10` is 0 for fewer than 10 commits, which would make
    # the modulo below raise ZeroDivisionError.
    progress_step = max(1, total_commits // 10)

    for i, commit in enumerate(commits, start=1):
        commit_date = commit.committed_datetime.date()
        # Resolve duplicate identities through the mailmap, keyed by e-mail.
        contributor = mailmap.get(commit.author.email, commit.author.name)
        if not any(exclusion in contributor for exclusion in exclusions):
            contributors[contributor].add(commit_date)
        if i % progress_step == 0 or i == total_commits:
            logging.info(f"Processed {i}/{total_commits} commits ({(i / total_commits) * 100:.1f}%)")
    return process_activity_periods(contributors, window, granularity)
# Function to plot the data
def plot_contributors(activity_periods, args, less_than_year=False):
    """Draw one horizontal bar per activity period and save the chart.

    Each contributor gets one row. Every ``(start_date, end_date, duration)``
    tuple of that contributor becomes a bar from ``start_date`` to
    ``end_date``. The contributor name and each period are also printed to
    standard output.

    Args:
        activity_periods (dict): Contributor name -> list of
            ``(start_date, end_date, duration)`` tuples.
        args: Object exposing ``activity_plot_file``, the output image path.
        less_than_year (bool): When true, x-axis ticks are placed every month
            and formatted ``YYYY-MM``; otherwise one labelled tick per year.

    Returns:
        None. When there is no period to draw, a warning is logged and no
        file is written.
    """
    all_periods = [period for periods in activity_periods.values() for period in periods]
    if not all_periods:
        # min()/max() below would raise on an empty sequence.
        logging.warning("No activity periods to plot, skipping activity plot")
        return

    figure = plt.figure(figsize=(15, 20))
    color_map = plt.colormaps.get_cmap('hsv')
    contributor_count = len(activity_periods)
    y_labels = []
    y_ticks = []
    min_date = min(period[0] for period in all_periods)
    max_date = max(period[1] for period in all_periods)

    # Drawing bars for activity periods
    for i, (contributor, periods) in enumerate(activity_periods.items()):
        print(f"Contributor: {contributor}")
        y_labels.append(contributor)
        y_ticks.append(i)
        # A float in [0, 1) spreads contributors over the whole colormap; a
        # raw integer would be used as a lookup-table index and give
        # neighbouring rows almost the same colour.
        bar_color = color_map(i / contributor_count)
        for start_date, end_date, duration in periods:
            print(f" Activity period: {start_date} to {end_date}. Duration: {duration}")
            plt.barh(i, (end_date - start_date).days, left=start_date, height=0.4,
                     color=bar_color, edgecolor='black')
        # Horizontal guide lines; every tenth row is drawn thicker.
        linewidth = 1.0 if i % 10 == 0 else 0.5
        plt.axhline(i, color='gray', linestyle='--', linewidth=linewidth)

    # Gray vertical line at the start of every year in the covered range.
    years = range(min_date.year, max_date.year + 1)
    year_positions = [datetime(year, 1, 1) for year in years]
    for year_start in year_positions:
        plt.axvline(year_start, color='gray', linestyle='-', linewidth=0.8)

    plt.yticks(y_ticks, y_labels)
    plt.xlabel('Date')
    plt.ylabel('Contributors')
    plt.title('Continuous Contribution Periods of Contributors')

    axis = plt.gca().xaxis
    if less_than_year:
        axis.set_major_locator(mdates.MonthLocator(interval=1))
        axis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    else:
        axis.set_major_locator(mdates.YearLocator())
        axis.set_major_formatter(mdates.DateFormatter('%Y'))
        plt.xticks(year_positions, [year_start.strftime('%Y') for year_start in year_positions])
    plt.xticks(rotation=45)
    plt.ylim(-1, len(activity_periods))

    # Pad the x-axis by 1% of the covered time span on both sides.
    total_duration = (max_date - min_date).days
    buffer = timedelta(days=int(total_duration * 0.01))
    plt.xlim(min_date - buffer, max_date + buffer)
    plt.tight_layout()

    # savefig does not create missing directories.
    output_dir = os.path.dirname(args.activity_plot_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(args.activity_plot_file)
    plt.close(figure)  # release the figure once it is on disk
    logging.info(f"Activity plot saved to: {os.path.abspath(args.activity_plot_file)}")
def aggregate_contributors_by_time(activity_periods):
    """Count active contributors at every period boundary date.

    The boundary dates are all period start and end dates. For each boundary
    the result holds the number of contributors with at least one period
    satisfying ``start_date <= boundary < end_date``. The end date is
    exclusive, so a period stops counting on its own end date, however many
    periods end on that day. A contributor is counted once even when several
    of their periods cover the same date.

    Args:
        activity_periods (dict): Contributor name -> list of
            ``(start_date, end_date, duration)`` tuples; ``duration`` is
            ignored here.

    Returns:
        list: ``(date, contributor_count)`` tuples sorted by date, one per
        boundary date (boundaries with a count of 0 are included). Empty when
        there are no periods.
    """
    boundaries = set()
    for periods in activity_periods.values():
        for start_date, end_date, _ in periods:
            boundaries.add(start_date)
            boundaries.add(end_date)

    aggregated = []
    for boundary in sorted(boundaries):
        active = sum(
            any(start_date <= boundary < end_date for start_date, end_date, _ in periods)
            for periods in activity_periods.values()
        )
        aggregated.append((boundary, active))
    return aggregated
def plot_contributor_count_over_time(aggregated_data, args):
    """
    Plot the number of contributors over time as a bar chart and save it.

    Each pair of consecutive dates becomes one bar: it starts at the earlier
    date, is as wide as the number of days to the next date, and is as high
    as the count stored for the earlier date. Bars with the same count share
    a colour.

    Args:
        aggregated_data: A list of ``(date, contributor_count)`` tuples in the
            order the bars should be drawn.
        args: Object exposing ``contributor_count_plot_file``, the output
            image path.

    Returns:
        None. When ``aggregated_data`` is empty, a warning is logged and no
        file is written.
    """
    if not aggregated_data:
        # zip(*[]) yields nothing, so the unpacking below would raise.
        logging.warning("No contributor counts to plot, skipping contributor count plot")
        return

    dates, counts = zip(*aggregated_data)  # Unzip the date and count tuples
    figure = plt.figure(figsize=(20, 6))

    # One colour per distinct count; wrap around when there are more distinct
    # counts than colours in the map.
    color_map = plt.colormaps.get_cmap('tab20')
    count_to_color = {
        count: color_map(i % color_map.N)
        for i, count in enumerate(sorted(set(counts)))
    }

    for start_date, end_date, count in zip(dates, dates[1:], counts):
        plt.bar(start_date, count, width=(end_date - start_date).days, align='edge',
                edgecolor='black', color=count_to_color[count])

    # One tick per year on the x-axis, integer ticks on the y-axis.
    axes = plt.gca()
    axes.xaxis.set_major_locator(mdates.YearLocator())
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)
    axes.yaxis.set_major_locator(MaxNLocator(integer=True))

    plt.xlabel('Date')
    plt.ylabel('Number of Contributors')
    plt.title('Number of Contributors Over Time')
    plt.grid(True)

    # Pad the x-axis by 1% of the covered time span on both sides.
    min_date = min(dates)
    max_date = max(dates)
    total_duration = (max_date - min_date).days
    buffer = timedelta(days=int(total_duration * 0.01))
    plt.xlim(min_date - buffer, max_date + buffer)

    # Pad the y-axis by 1% of the maximum count; use 1 as the base when every
    # count is 0 so the two limits never coincide.
    max_count = max(counts)
    y_buffer = max(max_count, 1) * 0.01
    plt.ylim(-y_buffer, max_count + y_buffer)
    plt.tight_layout()

    # savefig does not create missing directories.
    output_dir = os.path.dirname(args.contributor_count_plot_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(args.contributor_count_plot_file)
    plt.close(figure)  # release the figure once it is on disk
    logging.info(f"Contributor count plot saved to: {os.path.abspath(args.contributor_count_plot_file)}")
def main():
    """Command-line entry point: analyse a repository and write both plots.

    Steps: parse the command-line options, install a coloured log handler,
    load the mailmap, compute activity periods per contributor, drop
    contributors whose summed period durations are below
    ``--contribution_duration``, then render the activity plot and the
    contributor-count plot. When no contributor is left after filtering, a
    warning is logged and nothing is plotted.
    """
    parser = argparse.ArgumentParser(description='Plot contributor activity over time.')
    parser.add_argument('--repo_path', type=str, default='.', help='Path to the repository')
    parser.add_argument('--branch', type=str, default='main', help='Branch to analyze')
    parser.add_argument('--exclusions', nargs='*', default=["dependabot", "snyk"],
                        help='List of contributors to exclude')
    parser.add_argument('--delta_days', type=int, default=365 * 30,
                        help='Number of days to look back for commits')
    parser.add_argument('--window_days', type=int, default=90, help='Window of days for activity period')
    parser.add_argument('--granularity_days', type=int, default=15,
                        help='Granularity of activity period in days')
    parser.add_argument('--contribution_duration', type=int, default=1,
                        help='Minimum contribution duration to consider')
    # The flag switches the activity plot to monthly ticks.
    parser.add_argument('--less_than_year', action='store_true',
                        help='Use monthly date ticks on x-axis (for histories shorter than a year)')
    parser.add_argument('--activity_plot_file', type=str, default='out/activity_plot.png',
                        help='File name for the activity plot')
    parser.add_argument('--contributor_count_plot_file', type=str, default='out/contributor_count_plot.png',
                        help='File name for the contributor count plot')
    args = parser.parse_args()

    # ANSI colour codes used to tint log lines by level.
    RESET = "\033[0m"
    COLORS = {
        'DEBUG': "\033[94m",  # Blue
        'INFO': "\033[92m",  # Green
        'WARNING': "\033[93m",  # Yellow
        'ERROR': "\033[91m",  # Red
        'CRITICAL': "\033[95m"  # Magenta
    }

    class ColoredFormatter(logging.Formatter):
        """Formatter that wraps each record in the colour of its level."""

        def format(self, record):
            log_fmt = f"{COLORS.get(record.levelname, RESET)}%(asctime)s - %(levelname)s - %(message)s{RESET}"
            formatter = logging.Formatter(log_fmt)
            return formatter.format(record)

    # Set up logging with colors
    handler = logging.StreamHandler()
    handler.setFormatter(ColoredFormatter())
    logging.basicConfig(level=logging.INFO, handlers=[handler])

    logging.info("Starting contributor activity analysis")
    mailmap = parse_mailmap(args.repo_path)

    # Analyzing the repository for continuous contribution periods
    activity_periods = count_contributors(
        args.repo_path, args.branch, mailmap, args.exclusions,
        delta=timedelta(days=args.delta_days),
        window=timedelta(days=args.window_days),
        granularity=args.granularity_days
    )

    # Keep contributors whose total (unextended) activity reaches the minimum.
    activity_periods = {
        contributor: periods
        for contributor, periods in activity_periods.items()
        if sum(duration for _, _, duration in periods) >= args.contribution_duration
    }
    if not activity_periods:
        # Nothing to draw; plotting an empty data set cannot produce a chart.
        logging.warning("No contributors left after filtering, nothing to plot")
        return

    # Plotting the contribution activity on the graph
    plot_contributors(activity_periods, args, less_than_year=args.less_than_year)

    # Aggregate contributor data by time
    aggregated_data = aggregate_contributors_by_time(activity_periods)

    # Plotting the number of contributors over time
    plot_contributor_count_over_time(aggregated_data, args)


if __name__ == '__main__':
    main()