Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit fc996f0

Browse files
authored
Merge pull request #79 from CambioML/csv_feature
feat: add csv feature to extract_tables
2 parents 891e9e7 + 3972e7a commit fc996f0

File tree

2 files changed

+107
-17
lines changed

2 files changed

+107
-17
lines changed

any_parser/any_parser.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import json
55
import time
66
import uuid
7+
from collections.abc import Iterable
8+
from io import StringIO
79
from pathlib import Path
810

911
import requests
@@ -184,26 +186,75 @@ def extract_pii(
184186
file_type=file_type,
185187
)
186188

189+
@staticmethod
190+
def flatten_to_string(item):
191+
"""
192+
Flatten any iterable object to a string.
193+
"""
194+
195+
if isinstance(item, str):
196+
return item
197+
198+
# if item is a dict, flatten all keys and values
199+
if isinstance(item, dict):
200+
parts = []
201+
for k, v in item.items():
202+
parts.append(AnyParser.flatten_to_string(k))
203+
parts.append(AnyParser.flatten_to_string(v))
204+
return "".join(parts)
205+
206+
# item is other iterable objects
207+
if isinstance(item, Iterable):
208+
parts = []
209+
for sub_item in item:
210+
parts.append(AnyParser.flatten_to_string(sub_item))
211+
return "".join(parts)
212+
213+
# item is not iterable objects
214+
return str(item)
215+
187216
@handle_file_processing
def extract_tables(
    self,
    file_path=None,
    file_content=None,
    file_type=None,
    return_type="html",
):
    """Extract tables from a file in real-time.

    Args:
        file_path (str): The path to the file to be parsed.
        file_content (str): Raw file content as an alternative to
            file_path (presumably resolved by the
            ``handle_file_processing`` decorator — confirm there).
        file_type (str): The file's type/extension, if already known.
        return_type (str): Output format, ``"html"`` (default) or
            ``"csv"``. Compared case-insensitively; any value other
            than ``"csv"`` yields the HTML output.

    Returns:
        tuple(str, str): The extracted table data (HTML or CSV text)
            and the time elapsed.

    Raises:
        ImportError: If ``return_type`` is ``"csv"`` and pandas is not
            installed.
    """
    extracted_html, time_elapsed = self._sync_extract_tables.extract(
        file_path=file_path,
        file_content=file_content,
        file_type=file_type,
    )

    # The backend may return the tables as a list of fragments;
    # collapse them into one HTML string before any further processing.
    if isinstance(extracted_html, list):
        extracted_html = AnyParser.flatten_to_string(extracted_html)

    if return_type.lower() == "csv":
        # pandas is an optional dependency, needed only for CSV output;
        # import lazily so HTML extraction works without it.
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "Please install pandas to use CSV return_type"
            ) from exc

        # Parse every <table> in the HTML and concatenate them into a
        # single CSV document. (The redundant second list check that
        # used to live here was unreachable: the list was already
        # flattened to a string above.)
        df_list = pd.read_html(StringIO(extracted_html))
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df.to_csv(index=False), time_elapsed

    return extracted_html, time_elapsed
257+
207258
@handle_file_processing
208259
def extract_key_value(
209260
self,

examples/extract_tables.ipynb

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 6,
66
"metadata": {},
77
"outputs": [],
88
"source": [
@@ -15,15 +15,23 @@
1515
"cell_type": "code",
1616
"execution_count": 1,
1717
"metadata": {},
18-
"outputs": [],
18+
"outputs": [
19+
{
20+
"name": "stdout",
21+
"output_type": "stream",
22+
"text": [
23+
"/home/ubuntu/any-parser/any_parser/__init__.py\n"
24+
]
25+
}
26+
],
1927
"source": [
2028
"from IPython.display import display, Markdown\n",
2129
"from any_parser import AnyParser"
2230
]
2331
},
2432
{
2533
"cell_type": "code",
26-
"execution_count": null,
34+
"execution_count": 2,
2735
"metadata": {},
2836
"outputs": [],
2937
"source": [
@@ -36,8 +44,13 @@
3644
"metadata": {},
3745
"outputs": [],
3846
"source": [
39-
"file_path = \"./sample_data/test_1figure_1table.png\"\n",
40-
"html_output, time = ap.extract_tables(file_path)"
47+
"csv_output, time_info = ap.extract_tables(\n",
48+
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
49+
")\n",
50+
"\n",
51+
"html_output, time_info = ap.extract_tables(\n",
52+
" file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
53+
")"
4154
]
4255
},
4356
{
@@ -46,14 +59,12 @@
4659
"metadata": {},
4760
"outputs": [
4861
{
49-
"data": {
50-
"text/plain": [
51-
"'Time Elapsed: 3.97 seconds'"
52-
]
53-
},
54-
"execution_count": 7,
55-
"metadata": {},
56-
"output_type": "execute_result"
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"CPU times: user 2 μs, sys: 0 ns, total: 2 μs\n",
66+
"Wall time: 5.25 μs\n"
67+
]
5768
}
5869
],
5970
"source": [
@@ -62,9 +73,31 @@
6273
},
6374
{
6475
"cell_type": "code",
65-
"execution_count": 9,
76+
"execution_count": 8,
6677
"metadata": {},
6778
"outputs": [
79+
{
80+
"data": {
81+
"text/markdown": [
82+
"0,1,2\n",
83+
",latency,(ms)\n",
84+
"participants,mean,99th percentile\n",
85+
"1,17.0 +1.4,75.0 34.9\n",
86+
"2,24.5 +2.5,87.6 35.9\n",
87+
"5,31.5 +6.2,104.5 52.2\n",
88+
"10,30.0 +3.7,95.6 25.4\n",
89+
"25,35.5 +5.6,100.4 42.7\n",
90+
"50,42.7 +4.1,93.7 22.9\n",
91+
"100,71.4 +7.6,131.2 +17.6\n",
92+
"200,150.5 +11.0,320.3 35.1\n"
93+
],
94+
"text/plain": [
95+
"<IPython.core.display.Markdown object>"
96+
]
97+
},
98+
"metadata": {},
99+
"output_type": "display_data"
100+
},
68101
{
69102
"data": {
70103
"text/markdown": [
@@ -93,6 +126,12 @@
93126
}
94127
],
95128
"source": [
129+
"if isinstance(csv_output, list):\n",
130+
" csv_output_str = \"\\n\".join(csv_output)\n",
131+
"else:\n",
132+
" csv_output_str = csv_output\n",
133+
"\n",
134+
"display(Markdown(csv_output_str))\n",
96135
"display(Markdown(html_output))"
97136
]
98137
}
@@ -113,7 +152,7 @@
113152
"name": "python",
114153
"nbconvert_exporter": "python",
115154
"pygments_lexer": "ipython3",
116-
"version": "-1.-1.-1"
155+
"version": "3.12.2"
117156
}
118157
},
119158
"nbformat": 4,

0 commit comments

Comments
 (0)