From f12bd3e16fd3fed5e571ce834fd50a9ebfd49d22 Mon Sep 17 00:00:00 2001 From: neozhu Date: Wed, 13 Mar 2024 13:08:54 +0000 Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=8D=97=20Add=20new=20feature:=20Imple?= =?UTF-8?q?ment=20Chinese=20label=20recognition=20with=20PaddleOCR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../examples/paddleocr/.dockerignore | 11 ++ label_studio_ml/examples/paddleocr/Dockerfile | 32 +++++ label_studio_ml/examples/paddleocr/README.md | 125 ++++++++++++++++++ label_studio_ml/examples/paddleocr/_wsgi.py | 119 +++++++++++++++++ .../examples/paddleocr/docker-compose.yml | 27 ++++ .../examples/paddleocr/example.env | 11 ++ .../examples/paddleocr/paddleocr_ch.py | 122 +++++++++++++++++ .../examples/paddleocr/requirements.txt | 13 ++ .../examples/paddleocr/supervisord.conf | 24 ++++ label_studio_ml/examples/paddleocr/uwsgi.ini | 10 ++ 10 files changed, 494 insertions(+) create mode 100644 label_studio_ml/examples/paddleocr/.dockerignore create mode 100644 label_studio_ml/examples/paddleocr/Dockerfile create mode 100644 label_studio_ml/examples/paddleocr/README.md create mode 100644 label_studio_ml/examples/paddleocr/_wsgi.py create mode 100644 label_studio_ml/examples/paddleocr/docker-compose.yml create mode 100644 label_studio_ml/examples/paddleocr/example.env create mode 100644 label_studio_ml/examples/paddleocr/paddleocr_ch.py create mode 100644 label_studio_ml/examples/paddleocr/requirements.txt create mode 100644 label_studio_ml/examples/paddleocr/supervisord.conf create mode 100644 label_studio_ml/examples/paddleocr/uwsgi.ini diff --git a/label_studio_ml/examples/paddleocr/.dockerignore b/label_studio_ml/examples/paddleocr/.dockerignore new file mode 100644 index 00000000..164c0f16 --- /dev/null +++ b/label_studio_ml/examples/paddleocr/.dockerignore @@ -0,0 +1,11 @@ +Dockerfile +README.md +*.pyc +*.pyo +*.pyd +__pycache__ +.pytest_cache +.idea +docker-compose.yml +data +logs \ No newline at end of 
file diff --git a/label_studio_ml/examples/paddleocr/Dockerfile b/label_studio_ml/examples/paddleocr/Dockerfile new file mode 100644 index 00000000..983e1d57 --- /dev/null +++ b/label_studio_ml/examples/paddleocr/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.10 + +WORKDIR /tmp +COPY requirements.txt . + +ENV PYTHONUNBUFFERED=True \ + PORT=${PORT:-9090} \ + PIP_CACHE_DIR=/.cache + +# Installing libGL +RUN apt-get update && apt-get install -y \ + libgl1-mesa-dev + +RUN apt-get install -y poppler-utils libpoppler-cpp-dev libglib2.0-0 + + +RUN pip install --upgrade pip \ + && pip install -r requirements.txt + + +#COPY uwsgi.ini /etc/uwsgi/ +COPY supervisord.conf /etc/supervisor/conf.d/ + +WORKDIR /app + +COPY * /app/ + +EXPOSE 9090 + +CMD ["/usr/local/bin/supervisord", \ + "-c", \ + "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/label_studio_ml/examples/paddleocr/README.md b/label_studio_ml/examples/paddleocr/README.md new file mode 100644 index 00000000..0dc24ca4 --- /dev/null +++ b/label_studio_ml/examples/paddleocr/README.md @@ -0,0 +1,125 @@ +## Interactive BBOX OCR using Tesseract +Using an OCR engine for Interactive ML-Assisted Labelling, this functionality +can speed up annotation for layout detection, classification and recognition +models. + +Tesseract is used for OCR but minimal adaptation is needed to connect other OCR +engines or models. + +Tested against Label Studio 1.10.1, with basic support for both Label Studio +Local File Storage and S3-compatible storage, with an example data storage with +Minio. + +### Setup process +0. Download and install Docker with Docker Compose. For MacOS and Windows users, + we suggest using Docker Desktop. You will also need to have git installed. + +1. Launch Label Studio. 
+ + ``` + docker run -it \ + -p 8080:8080 \ + -v `pwd`/mydata:/label-studio/data \ + heartexlabs/label-studio:latest + ``` + + Optionally, you may enable local file serving in Label Studio + + ``` + docker run -it \ + -p 8080:8080 \ + -v `pwd`/mydata:/label-studio/data \ + --env LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED=true \ + --env LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT=/label-studio/data/images \ + heartexlabs/label-studio:latest + ``` + If you're using local file serving, be sure to get a copy of the API token from + Label Studio to connect the model. + +2. Create a new project for Tesseract OCR. In the project **Settings** set up the **Labeling Interface**. + + Fill in the following template code. It's important to specify `smart="true"` in RectangleLabels. + ``` + + + + + + +