-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4402342
commit 434a196
Showing
12 changed files
with
87,425 additions
and
0 deletions.
There are no files selected for viewing
3,377 changes: 3,377 additions & 0 deletions
3,377
Links Analysis Cross-Platforms/ChangeOrg_URLs.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
1,951 changes: 1,951 additions & 0 deletions
1,951
Links Analysis Cross-Platforms/Twitter_URLs-Unshort-4months.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
823 changes: 823 additions & 0 deletions
823
Links Analysis Cross-Platforms/Twitter_URLs-Unshort-dec.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
6,746 changes: 6,746 additions & 0 deletions
6,746
Links Analysis Cross-Platforms/YouTube_URLs.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
|
||
import pandas as pd | ||
from datetime import datetime | ||
import numpy as np | ||
import re | ||
from collections import Counter | ||
|
||
|
||
class URLS: | ||
def __init__(self, df): | ||
self.df = df | ||
|
||
def datetime_from_timestamp(self, timestamp_column): | ||
""" | ||
converts time stamp to ddatetime object | ||
""" | ||
return self.df[timestamp_column].apply( | ||
lambda x: x if np.isnan(x) else datetime.fromtimestamp(x) | ||
) | ||
|
||
def months_filteration(self, datetime_column, start, end): | ||
""" | ||
Filter dataframe due to time, get data between two dates | ||
""" | ||
return ( | ||
self.df[ | ||
(self.df[datetime_column].astype(str) >= start) | ||
& (self.df[datetime_column].astype(str) < end) | ||
] | ||
.reset_index() | ||
.drop(columns="index") | ||
) | ||
|
||
|
||
def get_urls(string): | ||
""" | ||
Get links from text | ||
""" | ||
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" | ||
url = re.findall(regex, string) | ||
urls = [x[0] for x in url] | ||
return urls | ||
|
||
|
||
def urls_df(filtered_df, column="n_urls"): | ||
return filtered_df[filtered_df[column] > 0].reset_index().drop(columns="index") | ||
|
||
|
||
def one_link_df(df, n_urls_column="n_urls", url_column="url", urls="urls"): | ||
one_link_df = df[df[n_urls_column] == 1] | ||
one_link_df[url_column] = one_link_df[urls].apply(lambda x: x[0]) | ||
one_link_df = one_link_df.reset_index().drop(columns="index") | ||
return one_link_df.reset_index().drop(columns="index") | ||
|
||
|
||
def one_link_urls(one_link_df, column="urls"): | ||
return one_link_df[column].values | ||
|
||
|
||
def one_link_urls_with_count(one_link_df, user_column, url_column="url"): | ||
users = [] | ||
for url in one_link_df[url_column].values: | ||
n = one_link_df[one_link_df.url == url][user_column].nunique() | ||
if n != 0: | ||
users.append(n) | ||
|
||
one_link_df["n_users"] = users | ||
|
||
temp_urls = [] | ||
for i in one_link_df[["url", "n_users"]].values: | ||
temp_urls.append((i[0], i[1])) | ||
data_last = dict(Counter(temp_urls)) | ||
links_last = [] | ||
users_last = [] | ||
count_last = [] | ||
for url_users, count in data_last.items(): | ||
links_last.append(url_users[0]) | ||
count_last.append(count) | ||
users_last.append(url_users[1]) | ||
|
||
return ( | ||
pd.DataFrame({"link": links_last, "count": count_last, "n_users": users_last}) | ||
.sort_values("n_users", ascending=False) | ||
.reset_index() | ||
.drop(columns="index") | ||
) | ||
|
||
|
||
def df_more_than_link(df): | ||
return df[df.n_urls > 1].reset_index().drop(columns="index") | ||
|
||
|
||
def more_than_one_link_with_count(df, column_user, column_urls): | ||
user_link_m = [] | ||
for user_links in df[[column_user, column_urls]].values: | ||
for link in user_links[1]: | ||
user_link_m.append((user_links[0], link)) | ||
temp = pd.DataFrame( | ||
{"link": [i[1] for i in user_link_m], "user": [i[0] for i in user_link_m]} | ||
) | ||
links = temp["link"].unique() | ||
last_links = [] | ||
last_u = [] | ||
last_c = [] | ||
for link in links: | ||
last_c.append(temp[temp["link"] == link].shape[0]) | ||
last_u.append(temp[temp["link"] == link]["user"].nunique()) | ||
last_links.append(link) | ||
|
||
return pd.DataFrame({"link": last_links, "count": last_c, "n_users": last_u}) |
Oops, something went wrong.