Skip to content

Commit

Permalink
Kulturnav Entititer
Browse files Browse the repository at this point in the history
  • Loading branch information
salgo60 committed Jun 30, 2020
1 parent 9e06cdb commit 9f5988a
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .ipynb_checkpoints/Riksdagens ledamöter-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
"* Wikidata [WikiProject_Sweden/Swedish_Riksdag_documents](https://www.wikidata.org/wiki/Wikidata:WikiProject_Sweden/Swedish_Riksdag_documents)\n",
"\n",
"\n",
"* intressent_id = Wikidata Property P1214 \n",
"* sourceid = Wikidata Property P8388 \n"
"* intressent_id = Wikidata Property [P1214](https://www.wikidata.org/wiki/Property:P1214) \n",
"* sourceid = Wikidata Property [P8388](https://www.wikidata.org/wiki/Property:P8388) \n"
]
},
{
Expand Down
127 changes: 92 additions & 35 deletions Kulturnav - Entity.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,50 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Kulturnav - Entity"
"## Kulturnav - Entity \n",
"* [This Notebook](https://github.com/salgo60/open-data-examples/blob/master/Kulturnav%20-%20Entity.ipynb)\n",
"* [API](https://kulturnav.org/info/api)\n",
"* [blogpost](https://minancestry.blogspot.com/2020/06/draft-kulturnav.html)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "82ca4700ec744398afc3c76ceb62ab64",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"************************"
"\n"
]
}
],
"source": [
"import urllib3, json\n",
"import pandas as pd \n",
"import pandas as pd \n",
"from tqdm.notebook import trange \n",
"http = urllib3.PoolManager() \n",
"pd.set_option(\"display.max.columns\", None) \n",
"urlbase = \"https://kulturnav.org/api/search/entityType:Entity?start=\" \n",
"\n",
"dftot = pd.DataFrame()\n",
"for i in range(1,500000,20):\n",
"for i in trange(1,500000,20):\n",
" url = urlbase + str(i)\n",
" if ((i-1) % 10000) == 0:\n",
" print('*', end='', flush=True)\n",
" r = http.request('GET', url)\n",
" try:\n",
" data = json.loads(r.data)\n",
Expand All @@ -42,32 +58,32 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 100000 entries, 0 to 19\n",
"Int64Index: 500000 entries, 0 to 19\n",
"Data columns (total 14 columns):\n",
"uuid 100000 non-null object\n",
"entityType 100000 non-null object\n",
"acl 100000 non-null object\n",
"currentVersion 100000 non-null bool\n",
"createdAt 100000 non-null object\n",
"createdBy 100000 non-null object\n",
"updatedAt 60000 non-null object\n",
"updatedBy 60000 non-null object\n",
"savedSource 85000 non-null object\n",
"inline 100000 non-null bool\n",
"caption 100000 non-null object\n",
"properties 100000 non-null object\n",
"entityTypeName 100000 non-null object\n",
"entityTypeHierarchy 100000 non-null object\n",
"uuid 500000 non-null object\n",
"entityType 500000 non-null object\n",
"acl 500000 non-null object\n",
"currentVersion 500000 non-null bool\n",
"createdAt 500000 non-null object\n",
"createdBy 500000 non-null object\n",
"updatedAt 300000 non-null object\n",
"updatedBy 300000 non-null object\n",
"savedSource 425000 non-null object\n",
"inline 500000 non-null bool\n",
"caption 500000 non-null object\n",
"properties 500000 non-null object\n",
"entityTypeName 500000 non-null object\n",
"entityTypeHierarchy 500000 non-null object\n",
"dtypes: bool(2), object(12)\n",
"memory usage: 10.1+ MB\n"
"memory usage: 50.5+ MB\n"
]
}
],
Expand All @@ -77,7 +93,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -131,7 +147,7 @@
" <td>SCRIPT: extractDrafts</td>\n",
" <td>False</td>\n",
" <td>{}</td>\n",
" <td>{'dataset.primaryLanguage': [{'uuid': '734bae0...</td>\n",
" <td>{'dataset.primaryLanguage': [{'uuid': '63c88b1...</td>\n",
" <td>{'nn': 'Organisasjon', 'no': 'Organisasjon', '...</td>\n",
" <td>[Agent, Organization]</td>\n",
" </tr>\n",
Expand Down Expand Up @@ -307,7 +323,7 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100000 rows × 14 columns</p>\n",
"<p>500000 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
Expand Down Expand Up @@ -377,7 +393,7 @@
"19 INTERACTIVE False {'sv': '__DIMU_STAGING (KulturIT AS)'} \n",
"\n",
" properties \\\n",
"0 {'dataset.primaryLanguage': [{'uuid': '734bae0... \n",
"0 {'dataset.primaryLanguage': [{'uuid': '63c88b1... \n",
"1 {'entity.dataset': [{'uuid': 'deecb7d7-8a3f-46... \n",
"2 {'dataset.privateAcl': [{'uuid': '855a7e66-e66... \n",
"3 {'entity.dataset': [{'uuid': '59978ea2-2dc8-47... \n",
Expand All @@ -402,10 +418,10 @@
"18 {'nn': 'Mappe', 'no': 'Mappe', 'sv': 'Mapp', '... [Dataset, List] \n",
"19 {'nn': 'Mappe', 'no': 'Mappe', 'sv': 'Mapp', '... [Dataset, List] \n",
"\n",
"[100000 rows x 14 columns]"
"[500000 rows x 14 columns]"
]
},
"execution_count": 4,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -416,19 +432,19 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"List 80000\n",
"Person 15000\n",
"Organization 5000\n",
"List 400000\n",
"Person 75000\n",
"Organization 25000\n",
"Name: entityType, dtype: int64"
]
},
"execution_count": 5,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -437,6 +453,47 @@
"dftot.entityType.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose entityType is \"Person\".\n",
"# The comparison must be `==`; a single `=` inside the index expression is a SyntaxError.\n",
"personE = dftot[dftot[\"entityType\"] == \"Person\"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1800x360 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Pie chart of the entityType value counts in dftot.\n",
"plot2 = (\n",
"    dftot[\"entityType\"]\n",
"    .value_counts()\n",
"    .plot.pie(y='counts', figsize=(25, 5))\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading

0 comments on commit 9f5988a

Please sign in to comment.