Skip to content

Commit 484442e

Browse files
Merge branch 'main' into llm-translate
2 parents 7657340 + efe76b9 commit 484442e

File tree

470 files changed

+71857
-19810
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

470 files changed

+71857
-19810
lines changed

.github/actions/test-template/action.yml

Lines changed: 25 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,8 @@ runs:
131131
env:
132132
GH_TOKEN: ${{ inputs.PAT }}
133133
run: |
134-
docker build -f Dockerfile -t curator .
134+
docker system prune -af
135+
docker build -f ray-curator/docker/Dockerfile -t ray-curator .
135136
136137
- name: Start container
137138
shell: bash
@@ -155,7 +156,7 @@ runs:
155156
--volume $(pwd)/NeMo-Curator:/workspace \
156157
--workdir /workspace \
157158
--volume $MNT_PATH/TestData:/home/TestData \
158-
curator \
159+
ray-curator \
159160
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
160161
RUN_TEST_EOF
161162
)
@@ -176,9 +177,8 @@ runs:
176177
docker exec -t nemo_container_${{ github.run_id }} bash -c '
177178
set -e
178179
179-
source activate /opt/conda/envs/curator
180-
181-
bash tests/${{ inputs.script }}.sh
180+
. /opt/venv/env.sh
181+
bash ray-curator/tests/${{ inputs.script }}.sh
182182
'
183183
184184
RUN_TEST_EOF
@@ -202,13 +202,13 @@ runs:
202202
id: check
203203
shell: bash
204204
run: |
205-
docker exec nemo_container_${{ github.run_id }} coverage combine || true
206-
docker exec nemo_container_${{ github.run_id }} coverage xml || true
207-
docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
208-
docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml
205+
# docker exec nemo_container_${{ github.run_id }} coverage combine || true
206+
# docker exec nemo_container_${{ github.run_id }} coverage xml || true
207+
# docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage
208+
# docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml
209209
210-
coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
211-
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
210+
# coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
211+
# echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
212212
213213
EXIT_CODE=${{ steps.run-main-script.outputs.exit_code }}
214214
IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")
@@ -225,17 +225,17 @@ runs:
225225
226226
exit $EXIT_CODE
227227
228-
- name: Test coverage
229-
shell: bash -x -e -u -o pipefail {0}
230-
run: |
231-
docker exec -t nemo_container_${{ github.run_id }} coverage report -i
232-
233-
- name: Upload artifacts
234-
uses: actions/upload-artifact@v4
235-
if: ${{ steps.check.outputs.coverage_report != 'none' }}
236-
with:
237-
name: ${{ steps.check.outputs.coverage_report }}
238-
path: |
239-
coverage.xml
240-
.coverage
241-
include-hidden-files: true
228+
# - name: Test coverage
229+
# shell: bash -x -e -u -o pipefail {0}
230+
# run: |
231+
# docker exec -t nemo_container_${{ github.run_id }} coverage report -i
232+
233+
# - name: Upload artifacts
234+
# uses: actions/upload-artifact@v4
235+
# if: ${{ steps.check.outputs.coverage_report != 'none' }}
236+
# with:
237+
# name: ${{ steps.check.outputs.coverage_report }}
238+
# path: |
239+
# coverage.xml
240+
# .coverage
241+
# include-hidden-files: true

.github/workflows/auto-labler.yml

Lines changed: 0 additions & 18 deletions
This file was deleted.

.github/workflows/cherry-pick-release-commit.yml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@ on:
88
jobs:
99
cherry-pick:
1010
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
11-
with:
12-
target-branches-pattern: 'r[0-9]+\.[0-9]+\.[0-9]+|ray-api'
1311
secrets:
1412
PAT: ${{ secrets.PAT }}
1513
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}

.github/workflows/cicd-main.yml

Lines changed: 21 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ jobs:
5959
uses: step-security/[email protected]
6060
with:
6161
files: |
62+
ray-curator/**
6263
nemo_curator/**
6364
.github/**
6465
pyproject.toml
@@ -148,33 +149,35 @@ jobs:
148149
- name: Install NeMo-Curator and pytest
149150
run: |
150151
pip install -U pip
151-
pip install --no-cache-dir wheel
152-
pip install --no-cache-dir .
153-
pip install --no-cache-dir pytest pytest-asyncio coverage
152+
cd ray-curator
153+
pip install --no-cache-dir ".[text]"
154+
pip install --no-cache-dir ".[video]"
155+
pip install --no-cache-dir "internvideo2-multi-modality @ git+https://github.com/suiyoubi/InternVideo.git@curator#subdirectory=InternVideo2/multi_modality"
156+
pip install --no-cache-dir --group test
154157
155-
- name: Run tests
158+
- name: Run tests (CPU)
156159
run: |
157-
python -m coverage run --branch --source=nemo_curator --omit=nemo_curator/scripts/* -m pytest -v --cpu
160+
cd ray-curator
161+
python -m coverage run --branch --source=ray_curator -m pytest -v tests -m "not gpu"
158162
159163
- name: Generate report
160164
id: check
161165
shell: bash
162166
run: |
167+
cd ray-curator
163168
python -m coverage xml
164169
python -m coverage report
165-
166170
coverage_report=coverage-unit-test-${{ github.run_id }}-$(uuidgen)
167171
echo "$coverage_report"
168172
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
169-
170173
- name: Upload artifacts
171174
uses: actions/upload-artifact@v4
172175
if: ${{ steps.check.outputs.coverage_report != 'none' }}
173176
with:
174177
name: ${{ steps.check.outputs.coverage_report }}
175178
path: |
176-
coverage.xml
177-
.coverage
179+
ray-curator/coverage.xml
180+
ray-curator/.coverage
178181
include-hidden-files: true
179182

180183
cicd-gpu-tests:
@@ -237,18 +240,15 @@ jobs:
237240
# Get workflow run details and check job conclusions
238241
NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
239242
NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
240-
241243
if [[ ($NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0) || $DOCS_ONLY == 'true' ]]; then
242244
RESULT="success"
243245
elif [[ $NUM_CANCELLED -gt 0 ]]; then
244246
RESULT="cancelled"
245247
else
246248
RESULT="failure"
247249
fi
248-
249250
# Output the final status
250251
echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
251-
252252
- name: Checkout for GH CLI
253253
uses: actions/checkout@v4
254254

@@ -276,13 +276,9 @@ jobs:
276276
issue-number: ${{ github.event.number }}
277277
body: |
278278
[🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
279-
280279
We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
281-
282280
So it might be time to merge this PR or get some approvals.
283-
284281
//cc @NVIDIA-NeMo/automation
285-
286282
- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
287283
if: |
288284
steps.result.outputs.code == 'failure'
@@ -299,9 +295,7 @@ jobs:
299295
set -x
300296
pip install PyGithub
301297
export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
302-
303298
python .github/scripts/notify.py
304-
305299
- name: Exit
306300
if: ${{ always() }}
307301
env:
@@ -326,7 +320,7 @@ jobs:
326320
&& !cancelled()
327321
strategy:
328322
matrix:
329-
flag: [unit-test, e2e]
323+
flag: [unit-test]
330324
steps:
331325
- name: Checkout
332326
uses: actions/checkout@v4
@@ -341,7 +335,6 @@ jobs:
341335
if: always()
342336
run: |
343337
pip install coverage[toml]
344-
345338
ls -al .
346339
ls -al coverage-*/
347340
coverage combine --keep $(ls coverage-*/.coverage)
@@ -365,11 +358,11 @@ jobs:
365358
include-hidden-files: true
366359

367360
codecov-placeholder:
368-
name: codecov/patch
369-
needs: [pre-flight]
370-
if: needs.pre-flight.outputs.docs_only == 'true'
371-
runs-on: ubuntu-latest
372-
steps:
373-
- name: codecov_placeholder
374-
run: |
375-
echo "This is a placeholder status check for when no tests are ran but the codecov status is expected"
361+
name: codecov/patch
362+
needs: [pre-flight]
363+
if: needs.pre-flight.outputs.docs_only == 'true'
364+
runs-on: ubuntu-latest
365+
steps:
366+
- name: codecov_placeholder
367+
run: |
368+
echo "This is a placeholder status check for when no tests are ran but the codecov status is expected"

docs/_templates/autodoc2_index.rst

Lines changed: 25 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -6,87 +6,62 @@ NeMo Curator's API reference provides comprehensive technical documentation for
66
.. grid:: 1 2 2 2
77
:gutter: 3
88

9-
.. grid-item-card:: :octicon:`database;1.5em;sd-mr-1` Core Data Handling
10-
:link: datasets/datasets
9+
.. grid-item-card:: :octicon:`server;1.5em;sd-mr-1` Execution Backends
10+
:link: backends/backends
1111
:link-type: doc
1212
:class-card: sd-border-0
1313

14-
**Datasets & Download**
14+
**Ray-based execution backends**
1515

16-
Essential classes for loading, managing, and downloading training data from various sources.
16+
Adapters and executors for running pipelines at scale.
1717

18-
:bdg-secondary:`doc-dataset` :bdg-secondary:`parallel-dataset` :bdg-secondary:`arxiv` :bdg-secondary:`commoncrawl`
18+
:bdg-secondary:`ray-data` :bdg-secondary:`xenna`
1919

20-
.. grid-item-card:: :octicon:`filter;1.5em;sd-mr-1` Data Processing
21-
:link: filters/filters
20+
.. grid-item-card:: :octicon:`workflow;1.5em;sd-mr-1` Pipeline
21+
:link: pipeline/pipeline
2222
:link-type: doc
2323
:class-card: sd-border-0
2424

25-
**Filters & Modifiers**
25+
**Orchestrate end-to-end workflows**
2626

27-
Tools for cleaning, filtering, and transforming text data to improve quality and remove unwanted content.
28-
29-
:bdg-secondary:`classifier-filter` :bdg-secondary:`heuristic-filter` :bdg-secondary:`pii-modifier`
27+
Build and run pipelines composed of processing stages.
3028

31-
.. grid-item-card:: :octicon:`code;1.5em;sd-mr-1` Classification & Analysis
32-
:link: classifiers/classifiers
29+
.. grid-item-card:: :octicon:`stack;1.5em;sd-mr-1` Processing Stages
30+
:link: stages/stages
3331
:link-type: doc
3432
:class-card: sd-border-0
3533

36-
**AI-Powered Analysis**
34+
**Download, transform, and write data**
3735

38-
Advanced classification tools and image processing capabilities for content analysis and quality assessment.
36+
Modular stages for download/extract, text models/classifiers, I/O, and utilities.
3937

40-
:bdg-secondary:`aegis` :bdg-secondary:`content-type` :bdg-secondary:`domain-classifier`
38+
:bdg-secondary:`download` :bdg-secondary:`text` :bdg-secondary:`io` :bdg-secondary:`modules`
4139

42-
.. grid-item-card:: :octicon:`shield-check;1.5em;sd-mr-1` Privacy & Security
43-
:link: pii/pii
40+
.. grid-item-card:: :octicon:`tasklist;1.5em;sd-mr-1` Tasks
41+
:link: tasks/tasks
4442
:link-type: doc
4543
:class-card: sd-border-0
4644

47-
**PII Detection & Redaction**
45+
**Core data structures**
4846

49-
Identify and handle personally identifiable information in datasets with advanced recognition algorithms.
50-
51-
:bdg-secondary:`recognizers` :bdg-secondary:`algorithms` :bdg-secondary:`redaction`
47+
Document batches, file groups, and related interfaces passed between stages.
5248

53-
.. grid-item-card:: :octicon:`zap;1.5em;sd-mr-1` Synthetic Data
54-
:link: synthetic/synthetic
49+
.. grid-item-card:: :octicon:`gear;1.5em;sd-mr-1` Utilities
50+
:link: utils/utils
5551
:link-type: doc
5652
:class-card: sd-border-0
5753

58-
**Data Generation**
54+
**Helper functions**
5955

60-
Create high-quality synthetic training data using advanced language models and generation techniques.
61-
62-
:bdg-secondary:`generator` :bdg-secondary:`nemotron` :bdg-secondary:`mixtral`
63-
64-
.. grid-item-card:: :octicon:`tools;1.5em;sd-mr-1` Advanced Processing
65-
:link: modules/modules
66-
:link-type: doc
67-
:class-card: sd-border-0
68-
69-
**Deduplication & Modules**
70-
71-
Advanced processing modules including semantic deduplication, fuzzy matching, and data pipeline components.
72-
73-
:bdg-secondary:`semantic-dedup` :bdg-secondary:`fuzzy-dedup` :bdg-secondary:`add-id`
56+
File, performance, and operation utilities used across the pipeline.
7457

7558
.. toctree::
7659
:maxdepth: 1
7760
:caption: API Modules
7861
:hidden:
7962

80-
datasets/datasets
81-
download/download
82-
filters/filters
83-
modifiers/modifiers
84-
modules/modules
85-
classifiers/classifiers
86-
image/image
87-
pii/pii
88-
synthetic/synthetic
89-
services/services
90-
nemo_run/nemo_run
63+
backends/backends
64+
pipeline/pipeline
65+
stages/stages
9166
tasks/tasks
9267
utils/utils

docs/about/index.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ modality: "universal"
1313

1414
NeMo Curator is an open-source, enterprise-grade platform for scalable, privacy-aware data curation across text, image, and video modalities.
1515

16-
NeMo Curator helps you prepare high-quality, compliant datasets for large language model (LLM) and generative artificial intelligence (AI) training. Whether you work in the cloud, on-premises, or in a hybrid environment, NeMo Curator supports your workflow.
16+
NeMo Curator, part of the NVIDIA NeMo software suite for managing the AI agent lifecycle, helps you prepare high-quality, compliant datasets for large language model (LLM) and generative artificial intelligence (AI) training. Whether you work in the cloud, on-premises, or in a hybrid environment, NeMo Curator supports your workflow.
1717

1818
## Target Users
1919

0 commit comments

Comments
 (0)