Skip to content

Commit 76efcf4

Browse files
authored
chore: add shfmt (#2246)
### Description Given all the shell files that now exist in the repo, it would be nice to have linting/formatting for them (in addition to the existing shellcheck, which doesn't do anything to format the shell code). This PR introduces `shfmt` to both check for formatting changes and apply formatting when the associated make targets are called.
1 parent 529d1f6 commit 76efcf4

File tree

117 files changed

+2380
-2412
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

117 files changed

+2380
-2412
lines changed

.github/workflows/ci.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ jobs:
7171
- name: ShellCheck
7272
uses: ludeeus/action-shellcheck@master
7373

74+
shfmt:
75+
runs-on: ubuntu-latest
76+
steps:
77+
- uses: actions/checkout@v3
78+
- name: setup shfmt
79+
uses: mfinelli/setup-shfmt@v3
80+
- name: Run shfmt
81+
run: shfmt -d .
82+
7483

7584
test_unit:
7685
strategy:

Makefile

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,10 @@ test-extra-xlsx:
345345
.PHONY: check
346346
check: check-ruff check-black check-flake8 check-version check-flake8-print
347347

348+
.PHONY: check-shfmt
349+
check-shfmt:
350+
shfmt -d .
351+
348352
.PHONY: check-black
349353
check-black:
350354
black . --check
@@ -382,7 +386,14 @@ check-version:
382386

383387
## tidy: run black
384388
.PHONY: tidy
385-
tidy:
389+
tidy: tidy-python
390+
391+
.PHONY: tidy-shell
392+
tidy-shell:
393+
shfmt -l -w .
394+
395+
.PHONY: tidy-python
396+
tidy-python:
386397
ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --fix-only --ignore COM812,PT011,PT012,SIM117 || true
387398
autoflake --in-place .
388399
black .

examples/ingest/airtable/ingest.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
4343
# base1/view1 → has to mention table to be valid
4444

4545
PYTHONPATH=. ./unstructured/ingest/main.py \
46-
airtable \
47-
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
48-
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
49-
--output-dir airtable-ingest-output \
50-
--num-processes 2 \
51-
--reprocess
46+
airtable \
47+
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
48+
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
49+
--output-dir airtable-ingest-output \
50+
--num-processes 2 \
51+
--reprocess

examples/ingest/azure/ingest.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
# Structured outputs are stored in azure-ingest-output/
77

8-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
azure \
13-
--remote-url abfs://container1/ \
14-
--account-name azureunstructured1 \
15-
--output-dir azure-ingest-output \
16-
--num-processes 2
12+
azure \
13+
--remote-url abfs://container1/ \
14+
--account-name azureunstructured1 \
15+
--output-dir azure-ingest-output \
16+
--num-processes 2

examples/ingest/azure_cognitive_search/ingest.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,18 @@
55

66
# Structured outputs are stored in azure-ingest-output/
77

8-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
s3 \
13-
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
14-
--anonymous \
15-
--output-dir s3-small-batch-output-to-azure \
16-
--num-processes 2 \
17-
--verbose \
18-
--strategy fast \
19-
azure-cognitive-search \
20-
--key "$AZURE_SEARCH_API_KEY" \
21-
--endpoint "$AZURE_SEARCH_ENDPOINT" \
22-
--index utic-test-ingest-fixtures-output
12+
s3 \
13+
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
14+
--anonymous \
15+
--output-dir s3-small-batch-output-to-azure \
16+
--num-processes 2 \
17+
--verbose \
18+
--strategy fast \
19+
azure-cognitive-search \
20+
--key "$AZURE_SEARCH_API_KEY" \
21+
--endpoint "$AZURE_SEARCH_ENDPOINT" \
22+
--index utic-test-ingest-fixtures-output

examples/ingest/biomed/ingest-with-api.sh

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,17 @@
1313
# For example, to download documents from 2019-01-02 00:00:00 to 2019-01-02+00:03:10"
1414
# the parameters "from" and "until" are needed
1515

16-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
16+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
1717
cd "$SCRIPT_DIR"/../../.. || exit 1
1818

1919
PYTHONPATH=. ./unstructured/ingest/main.py \
20-
biomed \
21-
--api-from "2019-01-02" \
22-
--api-until "2019-01-02+00:03:10" \
23-
--output-dir biomed-ingest-output-api \
24-
--num-processes 2 \
25-
--verbose \
26-
--preserve-downloads
27-
20+
biomed \
21+
--api-from "2019-01-02" \
22+
--api-until "2019-01-02+00:03:10" \
23+
--output-dir biomed-ingest-output-api \
24+
--num-processes 2 \
25+
--verbose \
26+
--preserve-downloads
2827

2928
# Alternatively, you can call it using:
3029
# unstructured-ingest --biomed-api ...

examples/ingest/biomed/ingest-with-path.sh

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
# For example, to download the documents in the path: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/07/
1515
# The path needed is oa_pdf/07/
1616

17-
18-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
17+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
1918
cd "$SCRIPT_DIR"/../../.. || exit 1
2019

2120
# The example below will ingest the PDF from the "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" path.
@@ -24,12 +23,12 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
2423
# WARNING: There are many documents in that path.
2524

2625
PYTHONPATH=. ./unstructured/ingest/main.py \
27-
biomed \
28-
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
29-
--output-dir biomed-ingest-output-path \
30-
--num-processes 2 \
31-
--verbose \
32-
--preserve-downloads
26+
biomed \
27+
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
28+
--output-dir biomed-ingest-output-path \
29+
--num-processes 2 \
30+
--verbose \
31+
--preserve-downloads
3332

3433
# Alternatively, you can call it using:
3534
# unstructured-ingest --biomed-path ...

examples/ingest/box/ingest.sh

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,14 @@
1818
# and set up the app config.json file here:
1919
# https://developer.box.com/guides/authentication/jwt/with-sdk/
2020

21-
22-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
21+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
2322
cd "$SCRIPT_DIR"/../../.. || exit 1
2423

2524
PYTHONPATH=. ./unstructured/ingest/main.py \
26-
box \
27-
--box-app-config "$BOX_APP_CONFIG_PATH" \
28-
--remote-url box://utic-test-ingest-fixtures \
29-
--output-dir box-output \
30-
--num-processes 2 \
31-
--recursive \
32-
--verbose
25+
box \
26+
--box-app-config "$BOX_APP_CONFIG_PATH" \
27+
--remote-url box://utic-test-ingest-fixtures \
28+
--output-dir box-output \
29+
--num-processes 2 \
30+
--recursive \
31+
--verbose

examples/ingest/confluence/ingest.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
2121
# --max-num-of-docs-from-each-space 250 \
2222
# --> The maximum number of documents to be ingested from each space. Set as 250 in the example.
2323
PYTHONPATH=. ./unstructured/ingest/main.py \
24-
confluence \
25-
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
26-
--url https://unstructured-ingest-test.atlassian.net \
27-
--user-email [email protected] \
28-
--api-token ABCDE1234ABDE1234ABCDE1234 \
29-
--output-dir confluence-ingest-output \
30-
--num-processes 2
24+
confluence \
25+
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
26+
--url https://unstructured-ingest-test.atlassian.net \
27+
--user-email [email protected] \
28+
--api-token ABCDE1234ABDE1234ABCDE1234 \
29+
--output-dir confluence-ingest-output \
30+
--num-processes 2

examples/ingest/delta_table/ingest.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,20 @@
66

77
# AWS credentials need to be available for use with the storage options
88
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
9-
echo "aws credentials not found as env vars"
10-
exit 0
9+
echo "aws credentials not found as env vars"
10+
exit 0
1111
fi
1212

13-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
13+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
1414
cd "$SCRIPT_DIR"/../../.. || exit 1
1515

1616
PYTHONPATH=. ./unstructured/ingest/main.py \
17-
delta-table \
18-
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
19-
--output-dir delta-table-output \
20-
--num-processes 2 \
21-
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
22-
--verbose \
23-
delta-table \
24-
--write-column json_data \
25-
--table-uri delta-table-dest
17+
delta-table \
18+
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
19+
--output-dir delta-table-output \
20+
--num-processes 2 \
21+
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
22+
--verbose \
23+
delta-table \
24+
--write-column json_data \
25+
--table-uri delta-table-dest

examples/ingest/discord/ingest.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44

55
# Structured outputs are stored in discord-example/
66

7-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
7+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
88
cd "$SCRIPT_DIR"/../../.. || exit 1
99

1010
PYTHONPATH=. ./unstructured/ingest/main.py \
11-
discord \
12-
--channels 12345678 \
13-
--token "$DISCORD_TOKEN" \
14-
--download-dir discord-ingest-download \
15-
--output-dir discord-example \
16-
--preserve-downloads \
17-
--verbose
11+
discord \
12+
--channels 12345678 \
13+
--token "$DISCORD_TOKEN" \
14+
--download-dir discord-ingest-download \
15+
--output-dir discord-example \
16+
--preserve-downloads \
17+
--verbose

examples/ingest/dropbox/ingest.sh

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,14 @@
1010

1111
# Structured outputs are stored in dropbox-output/
1212

13-
14-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
13+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
1514
cd "$SCRIPT_DIR"/../../.. || exit 1
1615

17-
1816
PYTHONPATH=. ./unstructured/ingest/main.py \
19-
dropbox \
20-
--remote-url "dropbox:// /" \
21-
--output-dir dropbox-output \
22-
--token "$DROPBOX_TOKEN" \
23-
--num-processes 2 \
24-
--recursive \
25-
--verbose
17+
dropbox \
18+
--remote-url "dropbox:// /" \
19+
--output-dir dropbox-output \
20+
--token "$DROPBOX_TOKEN" \
21+
--num-processes 2 \
22+
--recursive \
23+
--verbose

examples/ingest/elasticsearch/ingest.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ wait
1414
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT
1515

1616
PYTHONPATH=. ./unstructured/ingest/main.py \
17-
elasticsearch \
18-
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
19-
--url http://localhost:9200 \
20-
--index-name movies \
21-
--jq-query '{ethnicity, director, plot}' \
22-
--output-dir elasticsearch-ingest-output \
23-
--num-processes 2
17+
elasticsearch \
18+
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
19+
--url http://localhost:9200 \
20+
--index-name movies \
21+
--jq-query '{ethnicity, director, plot}' \
22+
--output-dir elasticsearch-ingest-output \
23+
--num-processes 2

examples/ingest/github/ingest.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55

66
# Structured outputs are stored in github-ingest-output/
77

8-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
github \
13-
--url Unstructured-IO/unstructured \
14-
--git-branch main \
15-
--output-dir github-ingest-output \
16-
--num-processes 2 \
17-
--verbose
12+
github \
13+
--url Unstructured-IO/unstructured \
14+
--git-branch main \
15+
--output-dir github-ingest-output \
16+
--num-processes 2 \
17+
--verbose
1818

1919
# Alternatively, you can call it using:
2020
# unstructured-ingest github --url ...

examples/ingest/gitlab/ingest.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55

66
# Structured outputs are stored in gitlab-ingest-output/
77

8-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
gitlab \
13-
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
14-
--git-branch 'v0.0.7' \
15-
--output-dir gitlab-ingest-output \
16-
--num-processes 2 \
17-
--verbose
12+
gitlab \
13+
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
14+
--git-branch 'v0.0.7' \
15+
--output-dir gitlab-ingest-output \
16+
--num-processes 2 \
17+
--verbose
1818

1919
# Alternatively, you can call it using:
2020
# unstructured-ingest gitlab --gitlab-url ...

examples/ingest/google_cloud_storage/ingest.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55

66
# Structured outputs are stored in gcs-output/
77

8-
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8+
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
99
cd "$SCRIPT_DIR"/../../.. || exit 1
1010

1111
PYTHONPATH=. ./unstructured/ingest/main.py \
12-
gcs \
13-
--remote-url gs://utic-test-ingest-fixtures-public/ \
14-
--output-dir gcs-output \
15-
--num-processes 2 \
16-
--recursive \
17-
--verbose
12+
gcs \
13+
--remote-url gs://utic-test-ingest-fixtures-public/ \
14+
--output-dir gcs-output \
15+
--num-processes 2 \
16+
--recursive \
17+
--verbose

0 commit comments

Comments
 (0)