-
Notifications
You must be signed in to change notification settings - Fork 537
Expand file tree
/
Copy pathDockerfile.delft
More file actions
164 lines (125 loc) · 6.15 KB
/
Dockerfile.delft
File metadata and controls
164 lines (125 loc) · 6.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
## Docker GROBID image using deep learning models and/or CRF models
## See https://grobid.readthedocs.io/en/latest/Grobid-docker/
## usage example with version 0.8.0:
## docker build -t grobid/grobid:0.8.0 --build-arg GROBID_VERSION=0.8.0 --file Dockerfile.delft .
## no GPU:
## docker run -t --rm --init -p 8070:8070 -p 8071:8071 -v /home/lopez/grobid/grobid-home/config/grobid.properties:/opt/grobid/grobid-home/config/grobid.properties:ro grobid/grobid:0.8.0
## allocate all available GPUs (only Linux with proper nvidia driver installed on host machine):
## docker run --rm --gpus all --init -p 8070:8070 -p 8071:8071 -v /home/lopez/grobid/grobid-home/config/grobid.properties:/opt/grobid/grobid-home/config/grobid.properties:ro grobid/grobid:0.8.0
# -------------------
# build builder image
# -------------------
FROM eclipse-temurin:21.0.10_7-jdk AS builder

# The package installation below has to run as root.
USER root

# Refresh the apt index, apply pending upgrades, then install the tools used
# later in this stage: unzip for the distribution archives, and git.
RUN apt-get update \
    && apt-get --yes upgrade \
    && apt-get --yes --no-install-recommends install git unzip

# Everything is copied to and built from this directory.
WORKDIR /opt/grobid-source
# Layer 1: Gradle wrapper and build configuration. These files rarely change,
# so the layers created from them stay cached for a long time.
COPY gradle/ ./gradle/
COPY gradlew gradle.properties build.gradle settings.gradle ./

# Layer 2: local libraries that dependency resolution needs.
COPY grobid-core/localLibs/ ./grobid-core/localLibs/

# Layer 3: empty subproject directories, so Gradle can configure the build
# before any source has been copied.
RUN mkdir -p grobid-home grobid-service grobid-trainer

# Layer 4: fetch all dependencies (stays cached until the build config changes).
RUN ./gradlew --no-daemon --quiet dependencies
# Layer 5: project sources. They change with every commit, so the build cache
# is invalidated from this point on.
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/

# Layer 6: .git, used to embed the revision. Copied as late as possible,
# right before the build.
COPY .git/ ./.git

# Target architecture of the image being built; read by the native-library
# clean-up that follows.
ARG TARGETARCH
# Remove native libraries that are not used, before packaging: pdf2xml as a
# whole, then the Windows, macOS and 32-bit Linux builds of pdfalto and lib.
RUN rm -rf grobid-home/pdf2xml \
    && rm -rf \
        grobid-home/pdfalto/win-* \
        grobid-home/pdfalto/mac-64 \
        grobid-home/pdfalto/mac_arm-64 \
        grobid-home/pdfalto/lin-32 \
    && rm -rf \
        grobid-home/lib/win-* \
        grobid-home/lib/mac-64 \
        grobid-home/lib/mac_arm-64 \
        grobid-home/lib/lin-32

# Of the two 64-bit Linux builds, keep only the one for the target
# architecture: arm64 drops lin-64, every other value drops lin_arm-64.
RUN case "$TARGETARCH" in \
        arm64) rm -rf grobid-home/pdfalto/lin-64 grobid-home/lib/lin-64 ;; \
        *)     rm -rf grobid-home/pdfalto/lin_arm-64 grobid-home/lib/lin_arm-64 ;; \
    esac

# Use the DL-powered configuration: grobid-full.yaml takes the place of the
# default grobid.yaml.
RUN rm grobid-home/config/grobid.yaml \
    && mv grobid-home/config/grobid-full.yaml grobid-home/config/grobid.yaml
# Build the distribution archives (verbose output helps diagnosing CI failures).
RUN ./gradlew clean assemble --no-daemon --info --stacktrace

# Unpack the distributions into /opt/grobid, the directory that the runtime
# stage copies from.
WORKDIR /opt/grobid

# The service archive unpacks into a versioned directory; rename it to a
# stable "grobid-service" path.
RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \
    mv grobid-service* grobid-service

# Unpack grobid-home and make sure the pdfalto binaries are executable.
RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \
    chmod -R 755 /opt/grobid/grobid-home/pdfalto

# NOTE: a former `RUN rm -rf grobid-source` was removed here. With the working
# directory set to /opt/grobid it targeted /opt/grobid/grobid-source, which
# never exists, so it did nothing. The real source tree, /opt/grobid-source,
# must stay in this stage: the runtime stage copies preload_embeddings.py and
# resources-registry.json from it.
# -------------------
# build runtime image
# -------------------
FROM eclipse-temurin:21.0.10_7-jre

# Probably redundant, but make the UTF-8 locale explicit.
ENV LANG=C.UTF-8

# System dependencies first, then Python 3.11 from the deadsnakes PPA.
# python3 is pointed at python3.11 through update-alternatives, and the apt
# caches are removed in the same layer.
RUN apt-get update \
    && apt-get -y --no-install-recommends install \
        bash \
        build-essential \
        curl \
        gcc \
        gfortran \
        libfontconfig \
        libxml2 \
        musl \
        software-properties-common \
        unzip \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get -y --no-install-recommends install \
        python3.11 \
        python3.11-dev \
        python3.11-distutils \
        python3.11-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Bootstrap pip for Python 3.11.
# The installer is downloaded to a file instead of being piped into the
# interpreter: with the default /bin/sh there is no pipefail, so a failed
# download would hand python an empty script, which exits 0 and leaves the
# image without pip. `-f` makes curl fail on HTTP errors, `-L` follows
# redirects, and the script is deleted in the same layer.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && \
    python3.11 /tmp/get-pip.py --no-cache-dir && \
    rm -f /tmp/get-pip.py
# TensorFlow flavour depends on the target architecture: amd64 gets the
# GPU-enabled build (CUDA pulled in through pip), every other architecture
# (arm64 in practice) gets the CPU-only package.
ARG TARGETARCH
RUN case "$TARGETARCH" in \
        amd64) tf_spec="tensorflow[and-cuda]==2.17.1" ;; \
        *)     tf_spec="tensorflow==2.17.1" ;; \
    esac \
    && pip3 install --no-cache-dir "$tf_spec"
# Bring in the unpacked GROBID distribution produced by the builder stage.
WORKDIR /opt/grobid
COPY --from=builder /opt/grobid ./

# DeLFT, pinned.
RUN pip3 install --no-cache-dir delft==0.4.5

# JEP has to be compiled against a full JDK (javac + JNI headers), which this
# JRE-based stage does not ship. The builder's JDK is bind-mounted for the
# duration of this single RUN step only, so nothing of it ends up in the
# final image.
RUN --mount=type=bind,from=builder,source=/opt/java/openjdk,target=/opt/java/openjdk-jdk \
    JAVA_HOME=/opt/java/openjdk-jdk pip3 install --no-cache-dir jep==4.3.1
# Link the data directory to /data, both under /opt/grobid and under the
# current working directory (which will most likely be /opt/grobid as well).
# `-n` keeps ln from following an already existing `data` symlink: when both
# paths are the same, a plain `ln -s /data ./data` would descend into the link
# created just before and leave a stray, self-referential /data/data -> /data.
# `-f` lets the second call simply replace the identical link.
RUN mkdir -p /data \
    && ln -sfn /data /opt/grobid/data \
    && ln -sfn /data ./data
# Runtime environment:
#   PYTHONWARNINGS      - disable Python warnings (and fix logging)
#   JAVA_OPTS           - JVM heap limit
#   PYTHON_VERSION      - Python version in use
#   LD_LIBRARY_PATH     - JEP plus the GROBID native libraries; both
#                         architecture paths are listed, only one of them
#                         exists at runtime
#   GROBID_SERVICE_OPTS - extra JVM options for the service (--add-opens)
ENV PYTHONWARNINGS="ignore" \
    JAVA_OPTS=-Xmx4g \
    PYTHON_VERSION=3.11 \
    LD_LIBRARY_PATH=/usr/local/lib/python3.11/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin_arm-64 \
    GROBID_SERVICE_OPTS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED"
# Add Tini, a minimal init process used as the container entrypoint.
# The release to download is selected by TINI_VERSION.
ENV TINI_VERSION=v0.19.0
# The binary matching the target architecture is fetched straight from the
# GitHub release. ${TARGETARCH} expands here only because `ARG TARGETARCH` has
# already been declared earlier in this stage.
# NOTE(review): the download is not checksum-verified; consider
# `ADD --checksum=sha256:...` once the expected digests are confirmed.
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini-${TARGETARCH} /tini
# Make the downloaded binary executable.
RUN chmod +x /tini
# Exec form, so tini is PID 1 and the CMD is passed to it after `--`.
# `-s` asks tini to register as a process subreaper (see tini's documentation).
ENTRYPOINT ["/tini", "-s", "--"]
# Working directory for the steps below and for the running container.
WORKDIR /opt/grobid
# Preload the embeddings. For GROBID all RNN models use glove-840B, which is
# the default of the script; ELMo is currently not loaded.
# To be done: a mechanism to download the GROBID fine-tuned models based on
# SciBERT when selected (not good enough for the moment).
COPY --from=builder \
    /opt/grobid-source/grobid-home/scripts/preload_embeddings.py \
    /opt/grobid-source/grobid-home/config/resources-registry.json \
    ./

# Run the preload script against the registry, then expose /opt/grobid under
# the /opt/delft path as well.
RUN python3 preload_embeddings.py --registry ./resources-registry.json \
    && ln -s /opt/grobid /opt/delft

# Keep a copy of the registry inside a local delft/ directory.
RUN mkdir delft \
    && cp ./resources-registry.json delft/

# Default command: start the GROBID service (relative to /opt/grobid).
CMD ["./grobid-service/bin/grobid-service"]
# Image metadata. The version label is filled from the GROBID_VERSION build
# argument (passed with --build-arg GROBID_VERSION=...).
ARG GROBID_VERSION
LABEL authors="The contributors"
LABEL org.label-schema.name="GROBID"
LABEL org.label-schema.description="Image with GROBID service"
LABEL org.label-schema.url="https://github.com/kermitt2/grobid"
LABEL org.label-schema.version="${GROBID_VERSION}"