Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit fc996f0

Browse files
authored
Merge pull request #79 from CambioML/csv_feature
feat: add csv feature to extract_tables
2 parents 891e9e7 + 3972e7a commit fc996f0

File tree

2 files changed

+107
-17
lines changed

2 files changed

+107
-17
lines changed

any_parser/any_parser.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import json
55
import time
66
import uuid
7+
from collections.abc import Iterable
8+
from io import StringIO
79
from pathlib import Path
810

911
import requests
@@ -184,26 +186,75 @@ def extract_pii(
184186
file_type=file_type,
185187
)
186188

189+
@staticmethod
190+
def flatten_to_string(item):
191+
"""
192+
Flatten any iterable object to a string.
193+
"""
194+
195+
if isinstance(item, str):
196+
return item
197+
198+
# if item is a dict, flatten all keys and values
199+
if isinstance(item, dict):
200+
parts = []
201+
for k, v in item.items():
202+
parts.append(AnyParser.flatten_to_string(k))
203+
parts.append(AnyParser.flatten_to_string(v))
204+
return "".join(parts)
205+
206+
# item is other iterable objects
207+
if isinstance(item, Iterable):
208+
parts = []
209+
for sub_item in item:
210+
parts.append(AnyParser.flatten_to_string(sub_item))
211+
return "".join(parts)
212+
213+
# item is not iterable objects
214+
return str(item)
215+
187216
@handle_file_processing
def extract_tables(
    self,
    file_path=None,
    file_content=None,
    file_type=None,
    return_type="html",
):
    """Extract tables from a file in real-time.

    Args:
        file_path (str): The path to the file to be parsed.
        file_content (str): Raw file content as an alternative to
            file_path (presumably resolved by the
            ``handle_file_processing`` decorator — confirm there).
        file_type (str): The file's type/extension, if already known.
        return_type (str): Output format, ``"html"`` (default) or
            ``"csv"``. Compared case-insensitively; any value other
            than ``"csv"`` yields the HTML output.

    Returns:
        tuple(str, str): The extracted table data (HTML or CSV text)
            and the time elapsed.

    Raises:
        ImportError: If ``return_type`` is ``"csv"`` and pandas is not
            installed.
    """
    extracted_html, time_elapsed = self._sync_extract_tables.extract(
        file_path=file_path,
        file_content=file_content,
        file_type=file_type,
    )

    # The backend may return the tables as a list of fragments;
    # collapse them into one HTML string before any further processing.
    if isinstance(extracted_html, list):
        extracted_html = AnyParser.flatten_to_string(extracted_html)

    if return_type.lower() == "csv":
        # pandas is an optional dependency, needed only for CSV output;
        # import lazily so HTML extraction works without it.
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "Please install pandas to use CSV return_type"
            ) from exc

        # Parse every <table> in the HTML and concatenate them into a
        # single CSV document. (The redundant second list check that
        # used to live here was unreachable: the list was already
        # flattened to a string above.)
        df_list = pd.read_html(StringIO(extracted_html))
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df.to_csv(index=False), time_elapsed

    return extracted_html, time_elapsed
257+
207258
@handle_file_processing
208259
def extract_key_value(
209260
self,

examples/extract_tables.ipynb

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 6,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -15,15 +15,23 @@
1515
"cell_type": "code",
1616
"execution_count": 1,
1717
"metadata": {},
18-
"outputs": [],
18+
"outputs": [
19+
{
20+
"name": "stdout",
21+
"output_type": "stream",
22+
"text": [
23+
"/home/ubuntu/any-parser/any_parser/__init__.py\n"
24+
]
25+
}
26+
],
1927
"source": [
2028
"from IPython.display import display, Markdown\n",
2129
"from any_parser import AnyParser"
2230
]
2331
},
2432
{
2533
"cell_type": "code",
26-
"execution_count": null,
34+
"execution_count": 2,
2735
"metadata": {},
2836
"outputs": [],
2937
"source": [
@@ -36,8 +44,13 @@
3644
"metadata": {},
3745
"outputs": [],
3846
"source": [
39-
"file_path = \"./sample_data/test_1figure_1table.png\"\n",
40-
"html_output, time = ap.extract_tables(file_path)"
47+
"csv_output, time_info = ap.extract_tables(\n",
48+
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
49+
")\n",
50+
"\n",
51+
"html_output, time_info = ap.extract_tables(\n",
52+
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
53+
")"
4154
]
4255
},
4356
{
@@ -46,14 +59,12 @@
4659
"metadata": {},
4760
"outputs": [
4861
{
49-
"data": {
50-
"text/plain": [
51-
"'Time Elapsed: 3.97 seconds'"
52-
]
53-
},
54-
"execution_count": 7,
55-
"metadata": {},
56-
"output_type": "execute_result"
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"CPU times: user 2 μs, sys: 0 ns, total: 2 μs\n",
66+
"Wall time: 5.25 μs\n"
67+
]
5768
}
5869
],
5970
"source": [
@@ -62,9 +73,31 @@
6273
},
6374
{
6475
"cell_type": "code",
65-
"execution_count": 9,
76+
"execution_count": 8,
6677
"metadata": {},
6778
"outputs": [
79+
{
80+
"data": {
81+
"text/markdown": [
82+
"0,1,2\n",
83+
",latency,(ms)\n",
84+
"participants,mean,99th percentile\n",
85+
"1,17.0 +1.4,75.0 34.9\n",
86+
"2,24.5 +2.5,87.6 35.9\n",
87+
"5,31.5 +6.2,104.5 52.2\n",
88+
"10,30.0 +3.7,95.6 25.4\n",
89+
"25,35.5 +5.6,100.4 42.7\n",
90+
"50,42.7 +4.1,93.7 22.9\n",
91+
"100,71.4 +7.6,131.2 +17.6\n",
92+
"200,150.5 +11.0,320.3 35.1\n"
93+
],
94+
"text/plain": [
95+
"<IPython.core.display.Markdown object>"
96+
]
97+
},
98+
"metadata": {},
99+
"output_type": "display_data"
100+
},
68101
{
69102
"data": {
70103
"text/markdown": [
@@ -93,6 +126,12 @@
93126
}
94127
],
95128
"source": [
129+
"if isinstance(csv_output, list):\n",
130+
" csv_output_str = \"\\n\".join(csv_output)\n",
131+
"else:\n",
132+
" csv_output_str = csv_output\n",
133+
"\n",
134+
"display(Markdown(csv_output_str))\n",
96135
"display(Markdown(html_output))"
97136
]
98137
}
@@ -113,7 +152,7 @@
113152
"name": "python",
114153
"nbconvert_exporter": "python",
115154
"pygments_lexer": "ipython3",
116-
"version": "-1.-1.-1"
155+
"version": "3.12.2"
117156
}
118157
},
119158
"nbformat": 4,

0 commit comments

Comments
 (0)