From 5517e94319cdb8cf017a9cd28b1f3b77cb1c903d Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Mon, 1 May 2023 09:43:30 -0700 Subject: [PATCH] [automation] transform lesson to sandpaper --- .editorconfig | 26 ++ .github/workflows/README.md | 198 +++++++++++++ .github/workflows/pr-close-signal.yaml | 23 ++ .github/workflows/pr-comment.yaml | 185 ++++++++++++ .github/workflows/pr-post-remove-branch.yaml | 32 ++ .github/workflows/pr-preflight.yaml | 39 +++ .github/workflows/pr-receive.yaml | 131 +++++++++ .github/workflows/sandpaper-main.yaml | 61 ++++ .github/workflows/sandpaper-version.txt | 1 + .github/workflows/update-cache.yaml | 125 ++++++++ .github/workflows/update-workflows.yaml | 66 +++++ .github/workflows/workbench-beta-phase.yml | 60 ++++ .gitignore | 55 ++++ CODE_OF_CONDUCT.md | 13 + CONTRIBUTING.md | 121 ++++++++ LICENSE.md | 79 +++++ README.md | 12 +- config.yaml | 94 ++++++ episodes/01-introduction.md | 118 ++++---- episodes/02-importing-data.md | 104 ++++--- episodes/03-working-with-data.md | 178 ++++++----- episodes/04-faceting-and-filtering.md | 189 +++++++----- episodes/05-clustering.md | 57 ++-- episodes/06-working-with-columns.md | 50 ++-- .../07-introduction-to-transformations.md | 72 +++-- episodes/08-writing-transformations.md | 78 +++-- episodes/09-undo-and-redo.md | 37 ++- episodes/10-data-transformation.md | 180 +++++++----- episodes/11-using-arrays-transformations.md | 124 +++++--- episodes/12-export-transformation.md | 37 ++- episodes/13-looking-up-data.md | 276 ++++++++++-------- {data => episodes/data}/UCSD_Guardian.csv | 0 .../data}/doaj-article-sample.csv | 0 .../data}/petitions-archive-list.xlsx | Bin {data => episodes/data}/solar-patents.csv | 0 .../files}/draft-instructor-notes.md | 0 index.md | 28 +- .../instructor-notes.md | 44 +-- {_extras => learners}/discuss.md | 4 +- learners/reference.md | 9 + setup.md => learners/setup.md | 76 ++--- profiles/learner-profiles.md | 5 + reference.md | 4 - site/README.md | 2 + 44 files 
changed, 2337 insertions(+), 656 deletions(-) create mode 100644 .editorconfig create mode 100755 .github/workflows/README.md create mode 100755 .github/workflows/pr-close-signal.yaml create mode 100755 .github/workflows/pr-comment.yaml create mode 100755 .github/workflows/pr-post-remove-branch.yaml create mode 100755 .github/workflows/pr-preflight.yaml create mode 100755 .github/workflows/pr-receive.yaml create mode 100755 .github/workflows/sandpaper-main.yaml create mode 100644 .github/workflows/sandpaper-version.txt create mode 100755 .github/workflows/update-cache.yaml create mode 100755 .github/workflows/update-workflows.yaml create mode 100644 .github/workflows/workbench-beta-phase.yml create mode 100644 .gitignore create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE.md create mode 100644 config.yaml rename {data => episodes/data}/UCSD_Guardian.csv (100%) rename {data => episodes/data}/doaj-article-sample.csv (100%) rename {data => episodes/data}/petitions-archive-list.xlsx (100%) rename {data => episodes/data}/solar-patents.csv (100%) rename {files => episodes/files}/draft-instructor-notes.md (100%) rename _extras/guide.md => instructors/instructor-notes.md (54%) rename {_extras => learners}/discuss.md (98%) create mode 100644 learners/reference.md rename setup.md => learners/setup.md (50%) create mode 100644 profiles/learner-profiles.md delete mode 100644 reference.md create mode 100644 site/README.md diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..5bf4860b --- /dev/null +++ b/.editorconfig @@ -0,0 +1,26 @@ +root = true + +[*] +charset = utf-8 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.md] +indent_size = 2 +indent_style = space +max_line_length = 100 # Please keep this in sync with bin/lesson_check.py! +trim_trailing_whitespace = false # keep trailing spaces in markdown - 2+ spaces are translated to a hard break (
) + +[*.r] +max_line_length = 80 + +[*.py] +indent_size = 4 +indent_style = space +max_line_length = 79 + +[*.sh] +end_of_line = lf + +[Makefile] +indent_style = tab diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100755 index 00000000..101967e4 --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,198 @@ +# Carpentries Workflows + +This directory contains workflows to be used for Lessons using the {sandpaper} +lesson infrastructure. Two of these workflows require R (`sandpaper-main.yaml` +and `pr-recieve.yaml`) and the rest are bots to handle pull request management. + +These workflows will likely change as {sandpaper} evolves, so it is important to +keep them up-to-date. To do this in your lesson you can do the following in your +R console: + +```r +# Install/Update sandpaper +options(repos = c(carpentries = "https://carpentries.r-universe.dev/", + CRAN = "https://cloud.r-project.org")) +install.packages("sandpaper") + +# update the workflows in your lesson +library("sandpaper") +update_github_workflows() +``` + +Inside this folder, you will find a file called `sandpaper-version.txt`, which +will contain a version number for sandpaper. This will be used in the future to +alert you if a workflow update is needed. + +What follows are the descriptions of the workflow files: + +## Deployment + +### 01 Build and Deploy (sandpaper-main.yaml) + +This is the main driver that will only act on the main branch of the repository. +This workflow does the following: + + 1. checks out the lesson + 2. provisions the following resources + - R + - pandoc + - lesson infrastructure (stored in a cache) + - lesson dependencies if needed (stored in a cache) + 3. builds the lesson via `sandpaper:::ci_deploy()` + +#### Caching + +This workflow has two caches; one cache is for the lesson infrastructure and +the other is for the the lesson dependencies if the lesson contains rendered +content. 
These caches are invalidated by new versions of the infrastructure and +the `renv.lock` file, respectively. If there is a problem with the cache, +manual invaliation is necessary. You will need maintain access to the repository +and you can either go to the actions tab and [click on the caches button to find +and invalidate the failing cache](https://github.blog/changelog/2022-10-20-manage-caches-in-your-actions-workflows-from-web-interface/) +or by setting the `CACHE_VERSION` secret to the current date (which will +invalidate all of the caches). + +## Updates + +### Setup Information + +These workflows run on a schedule and at the maintainer's request. Because they +create pull requests that update workflows/require the downstream actions to run, +they need a special repository/organization secret token called +`SANDPAPER_WORKFLOW` and it must have the `public_repo` and `workflow` scope. + +This can be an individual user token, OR it can be a trusted bot account. If you +have a repository in one of the official Carpentries accounts, then you do not +need to worry about this token being present because the Carpentries Core Team +will take care of supplying this token. + +If you want to use your personal account: you can go to + +to create a token. Once you have created your token, you should copy it to your +clipboard and then go to your repository's settings > secrets > actions and +create or edit the `SANDPAPER_WORKFLOW` secret, pasting in the generated token. + +If you do not specify your token correctly, the runs will not fail and they will +give you instructions to provide the token for your repository. + +### 02 Maintain: Update Workflow Files (update-workflow.yaml) + +The {sandpaper} repository was designed to do as much as possible to separate +the tools from the content. For local builds, this is absolutely true, but +there is a minor issue when it comes to workflow files: they must live inside +the repository. 
+ +This workflow ensures that the workflow files are up-to-date. The way it work is +to download the update-workflows.sh script from GitHub and run it. The script +will do the following: + +1. check the recorded version of sandpaper against the current version on github +2. update the files if there is a difference in versions + +After the files are updated, if there are any changes, they are pushed to a +branch called `update/workflows` and a pull request is created. Maintainers are +encouraged to review the changes and accept the pull request if the outputs +are okay. + +This update is run ~~weekly or~~ on demand. + +### 03 Maintain: Update Pacakge Cache (update-cache.yaml) + +For lessons that have generated content, we use {renv} to ensure that the output +is stable. This is controlled by a single lockfile which documents the packages +needed for the lesson and the version numbers. This workflow is skipped in +lessons that do not have generated content. + +Because the lessons need to remain current with the package ecosystem, it's a +good idea to make sure these packages can be updated periodically. The +update cache workflow will do this by checking for updates, applying them in a +branch called `updates/packages` and creating a pull request with _only the +lockfile changed_. + +From here, the markdown documents will be rebuilt and you can inspect what has +changed based on how the packages have updated. + +## Pull Request and Review Management + +Because our lessons execute code, pull requests are a secruity risk for any +lesson and thus have security measures associted with them. 
**Do not merge any +pull requests that do not pass checks and do not have bots commented on them.** + +This series of workflows all go together and are described in the following +diagram and the below sections: + +![Graph representation of a pull request](https://carpentries.github.io/sandpaper/articles/img/pr-flow.dot.svg) + +### Pre Flight Pull Request Validation (pr-preflight.yaml) + +This workflow runs every time a pull request is created and its purpose is to +validate that the pull request is okay to run. This means the following things: + +1. The pull request does not contain modified workflow files +2. If the pull request contains modified workflow files, it does not contain + modified content files (such as a situation where @carpentries-bot will + make an automated pull request) +3. The pull request does not contain an invalid commit hash (e.g. from a fork + that was made before a lesson was transitioned from styles to use the + workbench). + +Once the checks are finished, a comment is issued to the pull request, which +will allow maintainers to determine if it is safe to run the +"Receive Pull Request" workflow from new contributors. + +### Recieve Pull Request (pr-recieve.yaml) + +**Note of caution:** This workflow runs arbitrary code by anyone who creates a +pull request. GitHub has safeguarded the token used in this workflow to have no +priviledges in the repository, but we have taken precautions to protect against +spoofing. + +This workflow is triggered with every push to a pull request. If this workflow +is already running and a new push is sent to the pull request, the workflow +running from the previous push will be cancelled and a new workflow run will be +started. + +The first step of this workflow is to check if it is valid (e.g. that no +workflow files have been modified). If there are workflow files that have been +modified, a comment is made that indicates that the workflow is not run. 
If +both a workflow file and lesson content is modified, an error will occurr. + +The second step (if valid) is to build the generated content from the pull +request. This builds the content and uploads three artifacts: + +1. The pull request number (pr) +2. A summary of changes after the rendering process (diff) +3. The rendered files (build) + +Because this workflow builds generated content, it follows the same general +process as the `sandpaper-main` workflow with the same caching mechanisms. + +The artifacts produced are used by the next workflow. + +### Comment on Pull Request (pr-comment.yaml) + +This workflow is triggered if the `pr-recieve.yaml` workflow is successful. +The steps in this workflow are: + +1. Test if the workflow is valid and comment the validity of the workflow to the + pull request. +2. If it is valid: create an orphan branch with two commits: the current state + of the repository and the proposed changes. +3. If it is valid: update the pull request comment with the summary of changes + +Importantly: if the pull request is invalid, the branch is not created so any +malicious code is not published. + +From here, the maintainer can request changes from the author and eventually +either merge or reject the PR. When this happens, if the PR was valid, the +preview branch needs to be deleted. + +### Send Close PR Signal (pr-close-signal.yaml) + +Triggered any time a pull request is closed. This emits an artifact that is the +pull request number for the next action + +### Remove Pull Request Branch (pr-post-remove-branch.yaml) + +Tiggered by `pr-close-signal.yaml`. This removes the temporary branch associated with +the pull request (if it was created). 
diff --git a/.github/workflows/pr-close-signal.yaml b/.github/workflows/pr-close-signal.yaml new file mode 100755 index 00000000..9b129d5d --- /dev/null +++ b/.github/workflows/pr-close-signal.yaml @@ -0,0 +1,23 @@ +name: "Bot: Send Close Pull Request Signal" + +on: + pull_request: + types: + [closed] + +jobs: + send-close-signal: + name: "Send closing signal" + runs-on: ubuntu-latest + if: ${{ github.event.action == 'closed' }} + steps: + - name: "Create PRtifact" + run: | + mkdir -p ./pr + printf ${{ github.event.number }} > ./pr/NUM + - name: Upload Diff + uses: actions/upload-artifact@v3 + with: + name: pr + path: ./pr + diff --git a/.github/workflows/pr-comment.yaml b/.github/workflows/pr-comment.yaml new file mode 100755 index 00000000..bb2eb03c --- /dev/null +++ b/.github/workflows/pr-comment.yaml @@ -0,0 +1,185 @@ +name: "Bot: Comment on the Pull Request" + +# read-write repo token +# access to secrets +on: + workflow_run: + workflows: ["Receive Pull Request"] + types: + - completed + +concurrency: + group: pr-${{ github.event.workflow_run.pull_requests[0].number }} + cancel-in-progress: true + + +jobs: + # Pull requests are valid if: + # - they match the sha of the workflow run head commit + # - they are open + # - no .github files were committed + test-pr: + name: "Test if pull request is valid" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + payload: ${{ steps.check-pr.outputs.payload }} + number: ${{ steps.get-pr.outputs.NUM }} + msg: ${{ steps.check-pr.outputs.MSG }} + steps: + - name: 'Download PR artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'pr' + + - name: "Get PR Number" + if: ${{ steps.dl.outputs.success == 'true' }} + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NR)" >> $GITHUB_OUTPUT + + - name: 
"Fail if PR number was not present" + id: bad-pr + if: ${{ steps.dl.outputs.success != 'true' }} + run: | + echo '::error::A pull request number was not recorded. The pull request that triggered this workflow is likely malicious.' + exit 1 + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} + sha: ${{ github.event.workflow_run.head_sha }} + headroom: 3 # if it's within the last three commits, we can keep going, because it's likely rapid-fire + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + + # Create an orphan branch on this repository with two commits + # - the current HEAD of the md-outputs branch + # - the output from running the current HEAD of the pull request through + # the md generator + create-branch: + name: "Create Git Branch" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + contents: write + steps: + - name: 'Checkout md outputs' + uses: actions/checkout@v3 + with: + ref: md-outputs + path: built + fetch-depth: 1 + + - name: 'Download built markdown' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip built.zip + + - name: "Create orphan and push" + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + cd built/ + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + CURR_HEAD=$(git rev-parse HEAD) + git checkout --orphan md-outputs-PR-${NR} + git add -A + git commit -m "source commit: ${CURR_HEAD}" + ls -A | grep -v '^.git$' | xargs -I _ rm -r '_' + cd .. 
+ unzip -o -d built built.zip + cd built + git add -A + git commit --allow-empty -m "differences for PR #${NR}" + git push -u --force --set-upstream origin md-outputs-PR-${NR} + + # Comment on the Pull Request with a link to the branch and the diff + comment-pr: + name: "Comment on Pull Request" + needs: [test-pr, create-branch] + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + pull-requests: write + steps: + - name: 'Download comment artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'diff' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip ${{ github.workspace }}/diff.zip + + - name: "Comment on PR" + id: comment-diff + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + path: ${{ github.workspace }}/diff.md + + # Comment if the PR is open and matches the SHA, but the workflow files have + # changed + comment-changed-workflow: + name: "Comment if workflow files have changed" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ always() && needs.test-pr.outputs.is_valid == 'false' }} + env: + NR: ${{ github.event.workflow_run.pull_requests[0].number }} + body: ${{ needs.test-pr.outputs.msg }} + permissions: + pull-requests: write + steps: + - name: 'Check for spoofing' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - name: 'Alert if spoofed' + id: spoof + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + echo 'body<> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo '## :x: DANGER :x:' >> $GITHUB_ENV + echo 'This pull request has modified workflows that created output. Close this now.' 
>> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + + - name: "Comment on PR" + id: comment-diff + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + body: ${{ env.body }} + diff --git a/.github/workflows/pr-post-remove-branch.yaml b/.github/workflows/pr-post-remove-branch.yaml new file mode 100755 index 00000000..62c2e98d --- /dev/null +++ b/.github/workflows/pr-post-remove-branch.yaml @@ -0,0 +1,32 @@ +name: "Bot: Remove Temporary PR Branch" + +on: + workflow_run: + workflows: ["Bot: Send Close Pull Request Signal"] + types: + - completed + +jobs: + delete: + name: "Delete branch from Pull Request" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + permissions: + contents: write + steps: + - name: 'Download artifact' + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: pr + - name: "Get PR Number" + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NUM)" >> $GITHUB_OUTPUT + - name: 'Remove branch' + uses: carpentries/actions/remove-branch@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} diff --git a/.github/workflows/pr-preflight.yaml b/.github/workflows/pr-preflight.yaml new file mode 100755 index 00000000..d0d7420d --- /dev/null +++ b/.github/workflows/pr-preflight.yaml @@ -0,0 +1,39 @@ +name: "Pull Request Preflight Check" + +on: + pull_request_target: + branches: + ["main"] + types: + ["opened", "synchronize", "reopened"] + +jobs: + test-pr: + name: "Test if pull request is valid" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + permissions: + pull-requests: write + steps: + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} 
+ invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + - name: "Comment result of validation" + id: comment-diff + if: ${{ always() }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ github.event.number }} + body: ${{ steps.check-pr.outputs.MSG }} diff --git a/.github/workflows/pr-receive.yaml b/.github/workflows/pr-receive.yaml new file mode 100755 index 00000000..371ef542 --- /dev/null +++ b/.github/workflows/pr-receive.yaml @@ -0,0 +1,131 @@ +name: "Receive Pull Request" + +on: + pull_request: + types: + [opened, synchronize, reopened] + +concurrency: + group: ${{ github.ref }} + cancel-in-progress: true + +jobs: + test-pr: + name: "Record PR number" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + steps: + - name: "Record PR number" + id: record + if: ${{ always() }} + run: | + echo ${{ github.event.number }} > ${{ github.workspace }}/NR # 2022-03-02: artifact name fixed to be NR + - name: "Upload PR number" + id: upload + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ github.workspace }}/NR + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "echo output" + run: | + echo "${{ steps.hash.outputs.json }}" + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + + build-md-source: + name: "Build markdown source files if valid" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + CHIVE: ${{ github.workspace }}/site/chive + PR: ${{ github.workspace }}/site/pr + MD: ${{ github.workspace }}/site/built + steps: + - name: "Check Out Main Branch" + uses: actions/checkout@v3 + + - 
name: "Check Out Staging Branch" + uses: actions/checkout@v3 + with: + ref: md-outputs + path: ${{ env.MD }} + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Validate and Build Markdown" + id: build-site + run: | + sandpaper::package_cache_trigger(TRUE) + sandpaper::validate_lesson(path = '${{ github.workspace }}') + sandpaper:::build_markdown(path = '${{ github.workspace }}', quiet = FALSE) + shell: Rscript {0} + + - name: "Generate Artifacts" + id: generate-artifacts + run: | + sandpaper:::ci_bundle_pr_artifacts( + repo = '${{ github.repository }}', + pr_number = '${{ github.event.number }}', + path_md = '${{ env.MD }}', + path_pr = '${{ env.PR }}', + path_archive = '${{ env.CHIVE }}', + branch = 'md-outputs' + ) + shell: Rscript {0} + + - name: "Upload PR" + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ env.PR }} + + - name: "Upload Diff" + uses: actions/upload-artifact@v3 + with: + name: diff + path: ${{ env.CHIVE }} + retention-days: 1 + + - name: "Upload Build" + uses: actions/upload-artifact@v3 + with: + name: built + path: ${{ env.MD }} + retention-days: 1 + + - name: "Teardown" + run: sandpaper::reset_site() + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-main.yaml b/.github/workflows/sandpaper-main.yaml new file mode 100755 index 00000000..e17707ac --- /dev/null +++ b/.github/workflows/sandpaper-main.yaml @@ -0,0 +1,61 @@ +name: "01 Build and Deploy Site" + +on: + push: + branches: + - main + - master + schedule: + - cron: '0 0 * * 2' + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build?' 
+ required: true + default: 'Maintainer (via GitHub)' + reset: + description: 'Reset cached markdown files' + required: false + default: false + type: boolean +jobs: + full-build: + name: "Build Full Site" + runs-on: ubuntu-latest + permissions: + checks: write + contents: write + pages: write + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Deploy Site" + run: | + reset <- "${{ github.event.inputs.reset }}" == "true" + sandpaper::package_cache_trigger(TRUE) + sandpaper:::ci_deploy(reset = reset) + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-version.txt b/.github/workflows/sandpaper-version.txt new file mode 100644 index 00000000..4aa09069 --- /dev/null +++ b/.github/workflows/sandpaper-version.txt @@ -0,0 +1 @@ +0.11.15 diff --git a/.github/workflows/update-cache.yaml b/.github/workflows/update-cache.yaml new file mode 100755 index 00000000..676d7424 --- /dev/null +++ b/.github/workflows/update-cache.yaml @@ -0,0 +1,125 @@ +name: "03 Maintain: Update Package Cache" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' 
+ required: true + default: 'monthly run' + schedule: + # Run every tuesday + - cron: '0 0 * * 2' + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + steps: + - id: check + run: | + if [[ ${{ github.event_name }} == 'workflow_dispatch' ]]; then + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on request" + # using single brackets here to avoid 08 being interpreted as octal + # https://github.com/carpentries/sandpaper/issues/250 + elif [ `date +%d` -le 7 ]; then + # If the Tuesday lands in the first week of the month, run it + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on schedule" + else + echo "ok=false" >> $GITHUB_OUTPUT + echo "Not Running Today" + fi + + check_renv: + name: "Check if We Need {renv}" + runs-on: ubuntu-latest + needs: preflight + if: ${{ needs.preflight.outputs.ok == 'true'}} + outputs: + needed: ${{ steps.renv.outputs.exists }} + steps: + - name: "Checkout Lesson" + uses: actions/checkout@v3 + - id: renv + run: | + if [[ -d renv ]]; then + echo "exists=true" >> $GITHUB_OUTPUT + fi + + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + needs: check_renv + if: ${{ needs.check_renv.outputs.needed == 'true' }} + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_cache: + name: "Update Package Cache" + needs: check_token + if: ${{ needs.check_token.outputs.repo== 'true' }} + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Update {renv} deps and determine if a PR is needed" + id: update 
+ uses: carpentries/actions/update-lockfile@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: Create Pull Request + id: cpr + if: ${{ steps.update.outputs.n > 0 }} + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/packages" + commit-message: "[actions] update ${{ steps.update.outputs.n }} packages" + title: "Update ${{ steps.update.outputs.n }} packages" + body: | + :robot: This is an automated build + + This will update ${{ steps.update.outputs.n }} packages in your lesson with the following versions: + + ``` + ${{ steps.update.outputs.report }} + ``` + + :stopwatch: In a few minutes, a comment will appear that will show you how the output has changed based on these updates. + + If you want to inspect these changes locally, you can use the following code to check out a new branch: + + ```bash + git fetch origin update/packages + git checkout update/packages + ``` + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: package cache" + draft: false diff --git a/.github/workflows/update-workflows.yaml b/.github/workflows/update-workflows.yaml new file mode 100755 index 00000000..288bcd13 --- /dev/null +++ b/.github/workflows/update-workflows.yaml @@ -0,0 +1,66 @@ +name: "02 Maintain: Update Workflow Files" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' 
+ required: true + default: 'weekly run' + clean: + description: 'Workflow files/file extensions to clean (no wildcards, enter "" for none)' + required: false + default: '.yaml' + schedule: + # Run every Tuesday + - cron: '0 0 * * 2' + +jobs: + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_workflow: + name: "Update Workflow" + runs-on: ubuntu-latest + needs: check_token + if: ${{ needs.check_token.outputs.workflow == 'true' }} + steps: + - name: "Checkout Repository" + uses: actions/checkout@v3 + + - name: Update Workflows + id: update + uses: carpentries/actions/update-workflows@main + with: + clean: ${{ github.event.inputs.clean }} + + - name: Create Pull Request + id: cpr + if: "${{ steps.update.outputs.new }}" + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/workflows" + commit-message: "[actions] update sandpaper workflow to version ${{ steps.update.outputs.new }}" + title: "Update Workflows to Version ${{ steps.update.outputs.new }}" + body: | + :robot: This is an automated build + + Update Workflows from sandpaper version ${{ steps.update.outputs.old }} -> ${{ steps.update.outputs.new }} + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: template and tools" + draft: false diff --git a/.github/workflows/workbench-beta-phase.yml b/.github/workflows/workbench-beta-phase.yml new file mode 100644 index 00000000..2faa25d9 --- /dev/null +++ b/.github/workflows/workbench-beta-phase.yml @@ -0,0 +1,60 @@ +name: "Deploy to AWS" + +on: + workflow_run: + workflows: ["01 
Build and Deploy Site"] + types: + - completed + workflow_dispatch: + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + folder: ${{ steps.check.outputs.folder }} + steps: + - id: check + run: | + if [[ -z "${{ secrets.DISTRIBUTION }}" || -z "${{ secrets.AWS_ACCESS_KEY_ID }}" || -z "${{ secrets.AWS_SECRET_ACCESS_KEY }}" ]]; then + echo ":information_source: No site configured" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo 'To deploy the preview on AWS, you need the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `DISTRIBUTION` secrets set up' >> $GITHUB_STEP_SUMMARY + else + echo "::set-output name=folder::"$(sed -E 's^.+/(.+)^\1^' <<< ${{ github.repository }}) + echo "::set-output name=ok::true" + fi + + full-build: + name: "Deploy to AWS" + needs: [preflight] + if: ${{ needs.preflight.outputs.ok }} + runs-on: ubuntu-latest + steps: + + - name: "Checkout site folder" + uses: actions/checkout@v3 + with: + ref: 'gh-pages' + path: 'source' + + - name: "Deploy to Bucket" + uses: jakejarvis/s3-sync-action@v0.5.1 + with: + args: --acl public-read --follow-symlinks --delete --exclude '.git/*' + env: + AWS_S3_BUCKET: preview.carpentries.org + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + SOURCE_DIR: 'source' + DEST_DIR: ${{ needs.preflight.outputs.folder }} + + - name: "Invalidate CloudFront" + uses: chetan/invalidate-cloudfront-action@master + env: + PATHS: /* + AWS_REGION: 'us-east-1' + DISTRIBUTION: ${{ secrets.DISTRIBUTION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b8ab7062 --- /dev/null +++ b/.gitignore @@ -0,0 +1,55 @@ +# sandpaper files +episodes/*html +site/* +!site/README.md + +# History files +.Rhistory +.Rapp.history +# Session Data files +.RData +# 
User-specific files +.Ruserdata +# Example code in package build process +*-Ex.R +# Output files from R CMD build +/*.tar.gz +# Output files from R CMD check +/*.Rcheck/ +# RStudio files +.Rproj.user/ +# produced vignettes +vignettes/*.html +vignettes/*.pdf +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth +# knitr and R markdown default cache directories +*_cache/ +/cache/ +# Temporary files created by R markdown +*.utf8.md +*.knit.md +# R Environment Variables +.Renviron +# pkgdown site +docs/ +# translation temp files +po/*~ +# renv detritus +renv/sandbox/ +*.pyc +*~ +.DS_Store +.ipynb_checkpoints +.sass-cache +.jekyll-cache/ +.jekyll-metadata +__pycache__ +_site +.Rproj.user +.bundle/ +.vendor/ +vendor/ +.docker-vendor/ +Gemfile.lock +.*history diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..f19b8049 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,13 @@ +--- +title: "Contributor Code of Conduct" +--- + +As contributors and maintainers of this project, +we pledge to follow the [The Carpentries Code of Conduct][coc]. + +Instances of abusive, harassing, or otherwise unacceptable behavior +may be reported by following our [reporting guidelines][coc-reporting]. + + +[coc-reporting]: https://docs.carpentries.org/topic_folders/policies/incident-reporting.html +[coc]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..ec44704c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,121 @@ +## Contributing + +[The Carpentries][cp-site] ([Software Carpentry][swc-site], [Data +Carpentry][dc-site], and [Library Carpentry][lc-site]) are open source +projects, and we welcome contributions of all kinds: new lessons, fixes to +existing material, bug reports, and reviews of proposed changes are all +welcome. 
+ +### Contributor Agreement + +By contributing, you agree that we may redistribute your work under [our +license](LICENSE.md). In exchange, we will address your issues and/or assess +your change proposal as promptly as we can, and help you become a member of our +community. Everyone involved in [The Carpentries][cp-site] agrees to abide by +our [code of conduct](CODE_OF_CONDUCT.md). + +### How to Contribute + +The easiest way to get started is to file an issue to tell us about a spelling +mistake, some awkward wording, or a factual error. This is a good way to +introduce yourself and to meet some of our community members. + +1. If you do not have a [GitHub][github] account, you can [send us comments by + email][contact]. However, we will be able to respond more quickly if you use + one of the other methods described below. + +2. If you have a [GitHub][github] account, or are willing to [create + one][github-join], but do not know how to use Git, you can report problems + or suggest improvements by [creating an issue][issues]. This allows us to + assign the item to someone and to respond to it in a threaded discussion. + +3. If you are comfortable with Git, and would like to add or change material, + you can submit a pull request (PR). Instructions for doing this are + [included below](#using-github). + +Note: if you want to build the website locally, please refer to [The Workbench +documentation][template-doc]. + +### Where to Contribute + +1. If you wish to change this lesson, add issues and pull requests here. +2. If you wish to change the template used for workshop websites, please refer + to [The Workbench documentation][template-doc]. + + +### What to Contribute + +There are many ways to contribute, from writing new exercises and improving +existing ones to updating or filling in the documentation and submitting [bug +reports][issues] about things that do not work, are not clear, or are missing. 
+If you are looking for ideas, please see [the list of issues for this +repository][repo], or the issues for [Data Carpentry][dc-issues], [Library +Carpentry][lc-issues], and [Software Carpentry][swc-issues] projects. + +Comments on issues and reviews of pull requests are just as welcome: we are +smarter together than we are on our own. **Reviews from novices and newcomers +are particularly valuable**: it's easy for people who have been using these +lessons for a while to forget how impenetrable some of this material can be, so +fresh eyes are always welcome. + +### What *Not* to Contribute + +Our lessons already contain more material than we can cover in a typical +workshop, so we are usually *not* looking for more concepts or tools to add to +them. As a rule, if you want to introduce a new idea, you must (a) estimate how +long it will take to teach and (b) explain what you would take out to make room +for it. The first encourages contributors to be honest about requirements; the +second, to think hard about priorities. + +We are also not looking for exercises or other material that only run on one +platform. Our workshops typically contain a mixture of Windows, macOS, and +Linux users; in order to be usable, our lessons must run equally well on all +three. + +### Using GitHub + +If you choose to contribute via GitHub, you may want to look at [How to +Contribute to an Open Source Project on GitHub][how-contribute]. In brief, we +use [GitHub flow][github-flow] to manage changes: + +1. Create a new branch in your desktop copy of this repository for each + significant change. +2. Commit the change in that branch. +3. Push that branch to your fork of this repository on GitHub. +4. Submit a pull request from that branch to the [upstream repository][repo]. +5. If you receive feedback, make changes on your desktop and push to your + branch on GitHub: the pull request will update automatically. + +NB: The published copy of the lesson is usually in the `main` branch. 
+ +Each lesson has a team of maintainers who review issues and pull requests or +encourage others to do so. The maintainers are community volunteers, and have +final say over what gets merged into the lesson. + +### Other Resources + +The Carpentries is a global organisation with volunteers and learners all over +the world. We share values of inclusivity and a passion for sharing knowledge, +teaching and learning. There are several ways to connect with The Carpentries +community listed at including via social +media, slack, newsletters, and email lists. You can also [reach us by +email][contact]. + +[repo]: https://example.com/FIXME +[contact]: mailto:team@carpentries.org +[cp-site]: https://carpentries.org/ +[dc-issues]: https://github.com/issues?q=user%3Adatacarpentry +[dc-lessons]: https://datacarpentry.org/lessons/ +[dc-site]: https://datacarpentry.org/ +[discuss-list]: https://lists.software-carpentry.org/listinfo/discuss +[github]: https://github.com +[github-flow]: https://guides.github.com/introduction/flow/ +[github-join]: https://github.com/join +[how-contribute]: https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github +[issues]: https://carpentries.org/help-wanted-issues/ +[lc-issues]: https://github.com/issues?q=user%3ALibraryCarpentry +[swc-issues]: https://github.com/issues?q=user%3Aswcarpentry +[swc-lessons]: https://software-carpentry.org/lessons/ +[swc-site]: https://software-carpentry.org/ +[lc-site]: https://librarycarpentry.org/ +[template-doc]: https://carpentries.github.io/workbench/ diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..7632871f --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,79 @@ +--- +title: "Licenses" +--- + +## Instructional Material + +All Carpentries (Software Carpentry, Data Carpentry, and Library Carpentry) +instructional material is made available under the [Creative Commons +Attribution license][cc-by-human]. 
The following is a human-readable summary of +(and not a substitute for) the [full legal text of the CC BY 4.0 +license][cc-by-legal]. + +You are free: + +- to **Share**---copy and redistribute the material in any medium or format +- to **Adapt**---remix, transform, and build upon the material + +for any purpose, even commercially. + +The licensor cannot revoke these freedoms as long as you follow the license +terms. + +Under the following terms: + +- **Attribution**---You must give appropriate credit (mentioning that your work + is derived from work that is Copyright (c) The Carpentries and, where + practical, linking to ), provide a [link to the + license][cc-by-human], and indicate if changes were made. You may do so in + any reasonable manner, but not in any way that suggests the licensor endorses + you or your use. + +- **No additional restrictions**---You may not apply legal terms or + technological measures that legally restrict others from doing anything the + license permits. With the understanding that: + +Notices: + +* You do not have to comply with the license for elements of the material in + the public domain or where your use is permitted by an applicable exception + or limitation. +* No warranties are given. The license may not give you all of the permissions + necessary for your intended use. For example, other rights such as publicity, + privacy, or moral rights may limit how you use the material. + +## Software + +Except where otherwise noted, the example programs and other software provided +by The Carpentries are made available under the [OSI][osi]-approved [MIT +license][mit-license]. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +## Trademark + +"The Carpentries", "Software Carpentry", "Data Carpentry", and "Library +Carpentry" and their respective logos are registered trademarks of [Community +Initiatives][ci]. 
+ +[cc-by-human]: https://creativecommons.org/licenses/by/4.0/ +[cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode +[mit-license]: https://opensource.org/licenses/mit-license.html +[ci]: https://communityin.org/ +[osi]: https://opensource.org diff --git a/README.md b/README.md index f4d10c32..8b1004f1 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3266144.svg)](https://doi.org/10.5281/zenodo.3266144) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3266144.svg)](https://doi.org/10.5281/zenodo.3266144) # Maintainers for Library Carpentry: OpenRefine -- [Erin Carrillo](https://github.com/partiecolored) (Lead) +- [Erin Carrillo](https://github.com/partiecolored) (Lead) - [Owen Stephens](https://github.com/ostephens) - [Paul R. Pival](https://github.com/ppival) - [Kristin Lee](https://github.com/kristindawn) @@ -17,7 +17,7 @@ Lesson Maintainers communication is via the [team site](https://github.com/orgs/ ## Library Carpentry -[Library Carpentry](https://librarycarpentry.org) is a software and data skills training programme for people working in library- and information-related roles. It builds on the work of [Software Carpentry](http://software-carpentry.org/) and [Data Carpentry](http://www.datacarpentry.org/). Library Carpentry is an official Lesson Program of [The Carpentries](https://carpentries.org/). +[Library Carpentry](https://librarycarpentry.org) is a software and data skills training programme for people working in library- and information-related roles. It builds on the work of [Software Carpentry](https://software-carpentry.org/) and [Data Carpentry](https://www.datacarpentry.org/). Library Carpentry is an official Lesson Program of [The Carpentries](https://carpentries.org/). ## License @@ -38,10 +38,12 @@ Library Carpentry is authored and maintained through issues, commits, and pull r ## Citation -Erin Carillo (Ed.), Owen Stephens (Ed.), Juliane Schneider (Ed.), Paul R. 
Pival (Ed.), Kristin Lee (Ed.), Carmi Cronje (Ed.), James Baker, Christopher Erdmann, Tim Dennis, mhidas, Daniel Bangert, Evan Williamson, … Jeffrey Oliver. (2019, July). LibraryCarpentry/lc-open-refine: Library Carpentry: OpenRefine, June 2019 (Version v2019.06.1). Zenodo. http://doi.org/10.5281/zenodo.3266144 +Erin Carillo (Ed.), Owen Stephens (Ed.), Juliane Schneider (Ed.), Paul R. Pival (Ed.), Kristin Lee (Ed.), Carmi Cronje (Ed.), James Baker, Christopher Erdmann, Tim Dennis, mhidas, Daniel Bangert, Evan Williamson, … Jeffrey Oliver. (2019, July). LibraryCarpentry/lc-open-refine: Library Carpentry: OpenRefine, June 2019 (Version v2019.06.1). Zenodo. [http://doi.org/10.5281/zenodo.3266144](https://doi.org/10.5281/zenodo.3266144) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3266144.svg)](https://doi.org/10.5281/zenodo.3266144) ## Checking and Previewing the Lesson -To check and preview a lesson locally, see [http://carpentries.github.io/lesson-example/07-checking/index.html](http://carpentries.github.io/lesson-example/07-checking/index.html). +To check and preview a lesson locally, see [http://carpentries.github.io/lesson-example/07-checking/index.html](https://carpentries.github.io/lesson-example/07-checking/index.html). + + diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..9524cbcc --- /dev/null +++ b/config.yaml @@ -0,0 +1,94 @@ +#------------------------------------------------------------ +# Values for this lesson. +#------------------------------------------------------------ + +# Which carpentry is this (swc, dc, lc, or cp)? +# swc: Software Carpentry +# dc: Data Carpentry +# lc: Library Carpentry +# cp: Carpentries (to use for instructor training for instance) +# incubator: The Carpentries Incubator +carpentry: 'lc' + +# Overall title for pages. 
+title: 'Library Carpentry: OpenRefine' + +# Date the lesson was created (YYYY-MM-DD, this is empty by default) +created: + +# Comma-separated list of keywords for the lesson +keywords: 'software, data, lesson, The Carpentries' + +# Life cycle stage of the lesson +# possible values: pre-alpha, alpha, beta, stable +life_cycle: 'stable' + +# License of the lesson materials (recommended CC-BY 4.0) +license: 'CC-BY 4.0' + +# Link to the source repository for this lesson +source: 'https://github.com/fishtree-attempt/lc-open-refine/' + +# Default branch of your lesson +branch: 'main' + +# Who to contact if there are any issues +contact: 'team@carpentries.org' + +# Navigation ------------------------------------------------ +# +# Use the following menu items to specify the order of +# individual pages in each dropdown section. Leave blank to +# include all pages in the folder. +# +# Example ------------- +# +# episodes: +# - introduction.md +# - first-steps.md +# +# learners: +# - setup.md +# +# instructors: +# - instructor-notes.md +# +# profiles: +# - one-learner.md +# - another-learner.md + +# Order of episodes in your lesson +episodes: +- 01-introduction.md +- 02-importing-data.md +- 03-working-with-data.md +- 04-faceting-and-filtering.md +- 05-clustering.md +- 06-working-with-columns.md +- 07-introduction-to-transformations.md +- 08-writing-transformations.md +- 09-undo-and-redo.md +- 10-data-transformation.md +- 11-using-arrays-transformations.md +- 12-export-transformation.md +- 13-looking-up-data.md + +# Information for Learners +learners: + +# Information for Instructors +instructors: + +# Learner Profiles +profiles: + +# Customisation --------------------------------------------- +# +# This space below is where custom yaml items (e.g. 
pinning +# sandpaper and varnish versions) should live + + +url: https://preview.carpentries.org/lc-open-refine +analytics: carpentries +lang: en +workbench-beta: 'true' diff --git a/episodes/01-introduction.md b/episodes/01-introduction.md index 58ccb1a9..1f4c592b 100644 --- a/episodes/01-introduction.md +++ b/episodes/01-introduction.md @@ -1,72 +1,88 @@ --- -title: "Introduction to OpenRefine" +title: Introduction to OpenRefine teaching: 15 exercises: 0 -questions: -- "What is OpenRefine? What can it do?" -objectives: -- "Explain what the OpenRefine software does" -- "Explain how the OpenRefine software can help work with data files" -keypoints: -- "OpenRefine is 'a tool for working with messy data'" -- "OpenRefine works best with data in a simple tabular format" -- "OpenRefine can help you split data up into more granular parts" -- "OpenRefine can help you match local data up to other data sets" -- "OpenRefine can help you enhance a data set with data from other sources" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain what the OpenRefine software does +- Explain how the OpenRefine software can help work with data files + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is OpenRefine? What can it do? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## What is OpenRefine? - OpenRefine is a desktop application that uses your web browser as a graphical interface. It is described as "a power tool for working with messy data" ([David Huynh](http://web.archive.org/web/20141021040915/http://davidhuynh.net/spaces/nicar2011/tutorial.pdf)) - but what does this mean? It is probably easiest to describe the kinds of data OpenRefine is good at working with and the sorts of problems it can help you or your team solve. + +OpenRefine is a desktop application that uses your web browser as a graphical interface. 
It is described as "a power tool for working with messy data" ([David Huynh](https://web.archive.org/web/20141021040915/http://davidhuynh.net/spaces/nicar2011/tutorial.pdf)) - but what does this mean? It is probably easiest to describe the kinds of data OpenRefine is good at working with and the sorts of problems it can help you or your team solve. OpenRefine is most useful where you have data in a simple tabular format such as a spreadsheet, a comma separated values file (csv) or a tab delimited file (tsv) but with internal inconsistencies either in data formats, or where data appears, or in terminology used. OpenRefine can be used to standardize and clean data across your file. It can help you: -* Get an overview of a data set -* Resolve inconsistencies in a data set, for example standardizing date formatting -* Help you split data up into more granular parts, for example splitting up cells with multiple authors into separate cells -* Match local data up to other data sets - for example, in matching forms of personal names against name authority records in the Virtual International Authority File (VIAF) -* Enhance a data set with data from other sources +- Get an overview of a data set +- Resolve inconsistencies in a data set, for example standardizing date formatting +- Help you split data up into more granular parts, for example splitting up cells with multiple authors into separate cells +- Match local data up to other data sets - for example, in matching forms of personal names against name authority records in the Virtual International Authority File (VIAF) +- Enhance a data set with data from other sources Some common scenarios might be: -* Where you want to know how many times a particular value (name, publisher, subject) appears in a column in your data -* Where you want to know how values are distributed across your whole data set -* Where you have a list of dates which are formatted in different ways, and want to change all the dates in the list to a 
single common date format. For example: +- Where you want to know how many times a particular value (name, publisher, subject) appears in a column in your data +- Where you want to know how values are distributed across your whole data set +- Where you have a list of dates which are formatted in different ways, and want to change all the dates in the list to a single common date format. For example: -| Data you have | Desired data | -|-----------------|:-------------| -| 1st January 2014| 2014-01-01 | -| 01/01/2014 | 2014-01-01 | -| Jan 1 2014 | 2014-01-01 | -| 2014-01-01 | 2014-01-01 | +| Data you have | Desired data | +| ----------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- | +| 1st January 2014 | 2014-01-01 | +| 01/01/2014 | 2014-01-01 | +| Jan 1 2014 | 2014-01-01 | +| 2014-01-01 | 2014-01-01 | -* Where you have a list of names or terms that differ from each other but refer to the same people, places or concepts. For example: +- Where you have a list of names or terms that differ from each other but refer to the same people, places or concepts. For example: -| Data you have | Desired data | -|-----------------|:-------------| -| London | London | -| London] | London | -| London,] | London | -| london | London | +| Data you have | Desired data | +| ----------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- | +| London | London | +| London] | London | +| London,] | London | +| london | London | -* Where you have several bits of data combined together in a single column, and you want to separate them out into individual bits of data with one column for each bit of the data. 
For example going from a single address field (in the first column), to each part of the address in a separate field: +- Where you have several bits of data combined together in a single column, and you want to separate them out into individual bits of data with one column for each bit of the data. For example going from a single address field (in the first column), to each part of the address in a separate field: -| Address in single field | Institution | Library name | Address 1 | Address 2 | Town/City | Region | Country | Postcode | -|-------------------------|:-------------|:-------------|:-------------|:-------------|:-------------|:-------------|:-------------|:-------------| -| University of Wales, Llyfrgell Thomas Parry Library, Llanbadarn Fawr, ABERYSTWYTH, Ceredigion, SY23 3AS, United Kingdom | University of Wales | Llyfrgell Thomas Parry Library | Llanbadarn Fawr | | Aberystwyth | Ceredigion | United Kingdom | SY23 3AS | -| University of Aberdeen, Queen Mother Library, Meston Walk, ABERDEEN, AB24 3UE, United Kingdom | University of Abderdeen | Queen Mother Library | Meston Walk | | Aberdeen | | United Kingdom | AB24 3UE | -| University of Birmingham, Barnes Library, Medical School, Edgbaston, BIRMINGHAM, West Midlands, B15 2TT, United Kingdom | University of Birmingham | Barnes Library | Medical School | Edgbaston | Birmingham | West Midlands | United Kingdom | B15 2TT | -| University of Warwick, Library, Gibbett Hill Road, COVENTRY, CV4 7AL, United Kingdom | University of Warwick | Library | Gibbett Hill Road | | Coventry | | United Kingdom | CV4 7AL | +| Address in single field | Institution | Library name | Address 1 | Address 2 | Town/City | Region | Country | Postcode | +| ----------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- | :------------------------------------------------------------- | :---------------- | :-------- 
| :---------- | :------------ | :------------- | :------- | +| University of Wales, Llyfrgell Thomas Parry Library, Llanbadarn Fawr, ABERYSTWYTH, Ceredigion, SY23 3AS, United Kingdom | University of Wales | Llyfrgell Thomas Parry Library | Llanbadarn Fawr | | Aberystwyth | Ceredigion | United Kingdom | SY23 3AS | +| University of Aberdeen, Queen Mother Library, Meston Walk, ABERDEEN, AB24 3UE, United Kingdom | University of Abderdeen | Queen Mother Library | Meston Walk | | Aberdeen | | United Kingdom | AB24 3UE | +| University of Birmingham, Barnes Library, Medical School, Edgbaston, BIRMINGHAM, West Midlands, B15 2TT, United Kingdom | University of Birmingham | Barnes Library | Medical School | Edgbaston | Birmingham | West Midlands | United Kingdom | B15 2TT | +| University of Warwick, Library, Gibbett Hill Road, COVENTRY, CV4 7AL, United Kingdom | University of Warwick | Library | Gibbett Hill Road | | Coventry | | United Kingdom | CV4 7AL | -* Where you want to add to your data from an external data source: +- Where you want to add to your data from an external data source: -| Data you have | Date of Birth from VIAF (Virtual International Authority File) | Date of Death from VIAF (Virtual International Authority File) | -|-----------------|:-------------|:-------------| -| Braddon, M. E. (Mary Elizabeth) | 1835 | 1915 | -| Rossetti, William Michael | 1829 | 1919 | -| Prest, Thomas Peckett | 1810 | 1879 | +| Data you have | Date of Birth from VIAF (Virtual International Authority File) | Date of Death from VIAF (Virtual International Authority File) | +| ----------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------- | :------------------------------------------------------------- | +| Braddon, M. E. 
(Mary Elizabeth) | 1835 | 1915 | +| Rossetti, William Michael | 1829 | 1919 | +| Prest, Thomas Peckett | 1810 | 1879 | ## What Should I Know When Working With OpenRefine? -* No internet connection is needed, and none of the data or commands you enter in OpenRefine are sent to a remote server. -* You are NOT modifying original/raw data. -* Projects are autosaved every five minutes and when OpenRefine is properly shut down (Ctrl+C). See [History in User Manual](https://docs.openrefine.org/manual/running/#history-undoredo) for details. -* Files are saved locally such that if you are working on two computers you will have to export/import files/projects. + +- No internet connection is needed, and none of the data or commands you enter in OpenRefine are sent to a remote server. +- You are NOT modifying original/raw data. +- Projects are autosaved every five minutes and when OpenRefine is properly shut down (Ctrl+C). See [History in User Manual](https://docs.openrefine.org/manual/running/#history-undoredo) for details. +- Files are saved locally such that if you are working on two computers you will have to export/import files/projects. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- OpenRefine is 'a tool for working with messy data' +- OpenRefine works best with data in a simple tabular format +- OpenRefine can help you split data up into more granular parts +- OpenRefine can help you match local data up to other data sets +- OpenRefine can help you enhance a data set with data from other sources + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/02-importing-data.md b/episodes/02-importing-data.md index f37ed0f2..926e732e 100644 --- a/episodes/02-importing-data.md +++ b/episodes/02-importing-data.md @@ -1,17 +1,21 @@ --- -title: "Importing data into OpenRefine" +title: Importing data into OpenRefine teaching: 10 exercises: 5 -questions: -- "How do I get data into OpenRefine?" 
-objectives: -- "Successfully import data into OpenRefine" -keypoints: -- "Use the `Create Project` option to import data" -- "You can control how data imports using options on the import screen" -- "Several files types may be imported into OpenRefine." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Successfully import data into OpenRefine + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I get data into OpenRefine? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Importing data OpenRefine does not manipulate your data directly. @@ -24,38 +28,58 @@ computer. To do so, you transfer the exported files to the new computer and use "Import Project" on the new computer. ->## What kinds of data files can I import? ->There are several options for getting your data set into OpenRefine. You can upload or import files in a variety of formats including: -> ->* TSV (tab-separated values) ->* CSV (comma-separated values) ->* TXT ->* Excel ->* JSON (javascript object notation) ->* XML (extensible markup language) ->* Google Spreadsheet -{: .callout} - ->## Create your first OpenRefine project (using provided data) -> -> To import the data for the exercise below, follow the instructions in [Setup](https://librarycarpentry.github.io/lc-open-refine/setup.html) to download the data and run OpenRefine. *NOTE: If OpenRefine does not open in a browser window, open your browser and type the address to take you to the OpenRefine interface.* -> ->1. Once OpenRefine is launched in your browser, click `Create Project` from the left hand menu and select `Get data from This Computer` ->2. Click `Choose Files` (or 'Browse', depending on your setup) and locate the file which you have downloaded called `doaj-article-sample.csv` ->3. Click `Next»` where the next screen (see below) gives you options to ensure the data is imported into OpenRefine correctly. 
The options vary depending on the type of data you are importing. ->4. Click in the `Character encoding` box and set it to `UTF-8`. This ensures that OpenRefine correctly interprets the imported data as UTF-8 encoded. If you don't select this you may find that some special characters (e.g. smart quotation marks) are not displayed correctly. ->5. Ensure the first row is used to create the column headings by checking the box `Parse next 1 line(s) as column headers` ->6. OpenRefine will automatically select “Use character” to enclose cells containing column separators (such as a comma) as part of their data. This will make sure that OpenRefine doesn't misinterpret any commas (or other characters) within the column data as a delimiter. Keep this option selected. ->7. From OpenRefine 3.4 onwards there is an option to Trim leading & trailing whitespace from strings when importing separator-based files. Keeping this checked will ensure that values like `English` and `English `, which differ by a single trailing space, are not treated as different values after the import ->8. Make sure the `Attempt to parse cell text into numbers` box is not checked, so OpenRefine doesn't try to automatically detect numbers because this could cause errors such as confusion between date formats (e.g. DD/MM/YYYY vs MM/DD/YYYY). ->9. The Project Name box in the upper right corner will default to the title of your imported file. Click in the `Project Name` box to give your project a different name, if desired. ->10. Once you have selected the appropriate options for your project, click the `Create project »` button at the top right of the screen. This will create the project and open it for you. Projects are saved as you work on them, there is no need to save copies as you go along. -> -> ![Screenshot of Open Refine Create Project Screen](../assets/img/openrefine_ui.png) -{: .checklist} +::::::::::::::::::::::::::::::::::::::::: callout + +## What kinds of data files can I import? 
+ +There are several options for getting your data set into OpenRefine. You can upload or import files in a variety of formats including: + +- TSV (tab-separated values) +- CSV (comma-separated values) +- TXT +- Excel +- JSON (javascript object notation) +- XML (extensible markup language) +- Google Spreadsheet + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: checklist + +## Create your first OpenRefine project (using provided data) + +To import the data for the exercise below, follow the instructions in [Setup](https://librarycarpentry.github.io/lc-open-refine/setup.html) to download the data and run OpenRefine. *NOTE: If OpenRefine does not open in a browser window, open your browser and type the address [http://127.0.0.1:3333/](http://127.0.0.1:3333/) to take you to the OpenRefine interface.* + +1. Once OpenRefine is launched in your browser, click `Create Project` from the left hand menu and select `Get data from This Computer` +2. Click `Choose Files` (or 'Browse', depending on your setup) and locate the file which you have downloaded called `doaj-article-sample.csv` +3. Click `Next»` where the next screen (see below) gives you options to ensure the data is imported into OpenRefine correctly. The options vary depending on the type of data you are importing. +4. Click in the `Character encoding` box and set it to `UTF-8`. This ensures that OpenRefine correctly interprets the imported data as UTF-8 encoded. If you don't select this you may find that some special characters (e.g. smart quotation marks) are not displayed correctly. +5. Ensure the first row is used to create the column headings by checking the box `Parse next 1 line(s) as column headers` +6. OpenRefine will automatically select "Use character" to enclose cells containing column separators (such as a comma) as part of their data.
This will make sure that OpenRefine doesn't misinterpret any commas (or other characters) within the column data as a delimiter. Keep this option selected. +7. From OpenRefine 3.4 onwards there is an option to Trim leading \& trailing whitespace from strings when importing separator-based files. Keeping this checked will ensure that values like `English` and `English `, which differ by a single trailing space, are not treated as different values after the import +8. Make sure the `Attempt to parse cell text into numbers` box is not checked, so OpenRefine doesn't try to automatically detect numbers because this could cause errors such as confusion between date formats (e.g. DD/MM/YYYY vs MM/DD/YYYY). +9. The Project Name box in the upper right corner will default to the title of your imported file. Click in the `Project Name` box to give your project a different name, if desired. +10. Once you have selected the appropriate options for your project, click the `Create project »` button at the top right of the screen. This will create the project and open it for you. Projects are saved as you work on them, there is no need to save copies as you go along. + +![](fig/openrefine_ui.png){alt='Screenshot of Open Refine Create Project Screen'} + + +:::::::::::::::::::::::::::::::::::::::::::::::::: To open an existing project in OpenRefine you can click `Open Project` from the main OpenRefine screen (in the left hand menu). When you click this, you will see a list of the existing projects and can click on a project's name to open it. ### Going Further -* Look at the other options on the Import screen - try changing some of these options and see how that changes the Preview and how the data appears after import. -* Do you have access to JSON or XML data? If so the first stage of the import process will prompt you to select a 'record path' - that is the parts of the file that will form the data rows in the OpenRefine project. 
+ +- Look at the other options on the Import screen - try changing some of these options and see how that changes the Preview and how the data appears after import. +- Do you have access to JSON or XML data? If so the first stage of the import process will prompt you to select a 'record path' - that is the parts of the file that will form the data rows in the OpenRefine project. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Use the `Create Project` option to import data +- You can control how data imports using options on the import screen +- Several file types may be imported into OpenRefine. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/03-working-with-data.md b/episodes/03-working-with-data.md index 55ccb988..6bb5f4b4 100644 --- a/episodes/03-working-with-data.md +++ b/episodes/03-working-with-data.md @@ -1,54 +1,58 @@ --- -title: "Layout of OpenRefine, Rows vs Records" -teaching: 10 +title: Layout of OpenRefine, Rows vs Records +teaching: 10 exercises: 5 -questions: -- "How is data organised in OpenRefine?" -- "How do I access options to amend data in OpenRefine?" -- "What is the difference between Rows and Records in OpenRefine?" -- "How do I work with single cells that contain multiple values in a list?" -objectives: +--- + +::::::::::::::::::::::::::::::::::::::: objectives + - Locate controls for navigating data in OpenRefine - Find options to work with data through the OpenRefine dropdown menus - Split cells which contain multiple bits of data so that each piece of data is in its own cell -keypoints: -- "OpenRefine uses rows and columns to display data" -- "Most options to work with data in OpenRefine are accessed through a drop down menu at the top of a data column" -- "When you select an option in a particular column (e.g.
to make a change to the data), it will affect all the cells in that column" -- "OpenRefine has a Records mode which links together multiple rows into a single record" -- "Split and join multi-valued cells to modify the individual values within them" -- "When creating multi-valued cells in your data, choose a separator that will not appear in the data values" ---- + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How is data organised in OpenRefine? +- How do I access options to amend data in OpenRefine? +- What is the difference between Rows and Records in OpenRefine? +- How do I work with single cells that contain multiple values in a list? + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## The layout of OpenRefine + OpenRefine displays data in a tabular format. Each row will usually represent a 'record' in the data, while each column represents a type of information. This is very similar to how you might view data in a spreadsheet or database. As with a spreadsheet, the individual bits of data live in 'cells' at the intersection of a row and a column. OpenRefine only displays a limited number of rows of data at one time. You can adjust the number choosing between 5, 10 (the default), 25 and 50 at the top left of the table of data. You can navigate through the records by using the previous/next/first/last navigation options at the top right of the table of data. In OpenRefine it is always possible to undo any changes: note the left panel, currently empty. Read the four words at the top of the panel: -Facet/Filter and Undo/Redo. We will focus on the undo/redo commands much later in the workshop; you are welcome to use it at anytime, as needed. +Facet/Filter and Undo/Redo. We will focus on the undo/redo commands much later in the workshop; you are welcome to use it at anytime, as needed. 
## Working with data in OpenRefine + Most options to work with data in OpenRefine are accessed from drop down menus at the top of the data columns. When you select an option in a particular column (e.g. to make a change to the data), it will affect all the cells in that column. If you want to make changes across several columns, you will need to do this one column at a time. ## Rows and Records + OpenRefine has two modes of viewing data: 'Rows' and 'Records'. At the moment we are in Rows mode, where each row represents a single record in the data set - in this case, an article. In Records mode, OpenRefine can link together multiple rows as belonging to the same Record. Rows will be assigned to Records based on the values in the first column. See more [details of Rows and Records in the OpenRefine documentation](https://docs.openrefine.org/manual/exploring#rows-vs-records). ### Splitting Cells -To see how this works in practice we can split author names into separate cells. If you look at the Author column you should be able to see that there are multiple names in each cell separated by the pipe symbol ( \| ). +To see how this works in practice we can split author names into separate cells. If you look at the Author column you should be able to see that there are multiple names in each cell separated by the pipe symbol ( | ). To work with the author names effectively in OpenRefine, we need to have each name in an individual cell. 
To split the names into their own cells, we can use a `Split multi-valued cells` function: -* Click the dropdown menu at the top of the Author column -* Choose `Edit cells->Split multi-valued cells` -* In the prompt type the ( \| ) symbol and click `OK` - * Note that the rows are still numbered sequentially -* Click the `Records` option to change to Records mode - * Note how the numbering has changed - indicating that several rows are related to the same record +- Click the dropdown menu at the top of the Author column +- Choose `Edit cells->Split multi-valued cells` +- In the prompt type the ( | ) symbol and click `OK` + - Note that the rows are still numbered sequentially +- Click the `Records` option to change to Records mode + - Note how the numbering has changed - indicating that several rows are related to the same record - ![Screen capture showing OpenRefine in Rows mode.](../assets/img/rows.png) - ![Screen capture showing OpenRefine in Rows mode.](../assets/img/records.png) +![](fig/rows.png){alt='Screen capture showing OpenRefine in Rows mode.'} +![](fig/records.png){alt='Screen capture showing OpenRefine in Records mode.'} Note in the images above the difference between: Rows with the same Title appear below each shared title, interrupted the numbered sequence in the third column from the left. Shared titles have the same shading, which may be very difficult to distinguish visually, so look for each star and flag in the leftmost columns, which indicates a new row, that is an item with a different author. @@ -64,29 +68,30 @@ A common workflow with multi-valued cells is Modifying cells will be covered in future lessons, but for now we will cover how to join cells back together that have been split previously. -* Click the dropdown menu at the top of the Author column -* Choose `Edit cells->Join multi-valued cells` -* In the prompt type the ( \| ) symbol - * Here we are specifying the *delimiter* character for OpenRefine to use to join the values together.
-* Click `OK` to join the Authors cells back together +- Click the dropdown menu at the top of the Author column +- Choose `Edit cells->Join multi-valued cells` +- In the prompt type the ( | ) symbol + - Here we are specifying the *delimiter* character for OpenRefine to use to join the values together. +- Click `OK` to join the Authors cells back together You will now see that split rows have gone away - the Authors have been joined into a single cell with the specified delimiter. Our Rows and Records values will now be the same since we do not have any more columns with split (multi-valued) cells. -* Click both the `Rows` and `Records` options and observe how the numbers of Rows and Records are equal +- Click both the `Rows` and `Records` options and observe how the numbers of Rows and Records are equal ### Choosing a good separator The value that separates multi-valued cells is called a separator or delimiter. Choosing a good -separator is important. In the examples, we've seen the pipe character ( \| ) has been used. +separator is important. In the examples, we've seen the pipe character ( | ) has been used. Choosing the wrong separator can lead to problems. Consider the following multi-valued Author example, with a pipe as a separator. + ``` Jones, Andrew | Davis, S. ``` -When we tell OpenRefine to split this cell on the pipe ( \| ), we will get the following two authors each in their own cell since there is a single pipe character separating them. +When we tell OpenRefine to split this cell on the pipe ( | ), we will get the following two authors each in their own cell since there is a single pipe character separating them. - **Author 1:** Jones, Andrew - **Author 2:** Davis, S. @@ -97,7 +102,7 @@ Now imagine that the document creator had chosen a **comma** as the separator in Jones, Andrew , Davis, S. ``` -Can you spot the problem? Can you tell where one author stops and the next begins? +Can you spot the problem? 
Can you tell where one author stops and the next begins? OpenRefine will split on **every** comma it encounters, so we'll end up with 4 authors, not two, because OpenRefine cannot tell that **Jones, Andrew** is supposed to be a single author. We will get @@ -110,36 +115,73 @@ the following four "authors" because there are 3 commas separating the name part Splitting on a comma will not work with Authors because the names may include commas within them. -> ## Choose a separator that is not in your data values -> -> When creating a spreadsheet with multi-valued cells, it is important to choose a separator that will never appear in -> the cell values themselves. For this reason, the pipe character ( \| ) is often a good choice since it -> is rarely used in data. Commas, colons and semi-colons should be avoided as separators. -{: .callout} - ->## Splitting Subjects into separate cells -> ->1. What separator character is used in the Subjects cells? ->2. How would you split these subjects into individual cells? -> -> > ## Solution -> > 1. The subject words/headings are divided up with the pipe ( \| ) character -> > 2. To split the subject words into individual cells you need to: -> > * Click the dropdown menu at the top of the Subjects column -> > * Choose 'Edit cells->Split multi-valued cells' -> > * In the prompt type the ( \| ) symbol and click 'OK' -> {: .solution} -{: .challenge} - ->## Joining the Subjects column back together -> ->1. Using what we've learned, now Join the Subjects back together -> -> > ## Solution -> > 1. The subject words/headings were previously delimited with the pipe ( \| ) character -> > 2. 
To join the split subject cells back to a single cell you need to: -> > * Click the dropdown menu at the top of the Subjects column -> > * Choose 'Edit cells->Join multi-valued cells' -> > * In the prompt type the ( \| ) symbol and click 'OK' -> {: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::::: callout + +## Choose a separator that is not in your data values + +When creating a spreadsheet with multi-valued cells, it is important to choose a separator that will never appear in +the cell values themselves. For this reason, the pipe character ( | ) is often a good choice since it +is rarely used in data. Commas, colons and semi-colons should be avoided as separators. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +## Splitting Subjects into separate cells + +1. What separator character is used in the Subjects cells? +2. How would you split these subjects into individual cells? + +::::::::::::::: solution + +## Solution + +1. The subject words/headings are divided up with the pipe ( | ) character +2. To split the subject words into individual cells you need to: + +- Click the dropdown menu at the top of the Subjects column +- Choose 'Edit cells->Split multi-valued cells' +- In the prompt type the ( | ) symbol and click 'OK' + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +## Joining the Subjects column back together + +1. Using what we've learned, now Join the Subjects back together + +::::::::::::::: solution + +## Solution + +1. The subject words/headings were previously delimited with the pipe ( | ) character +2. 
To join the split subject cells back to a single cell you need to: + +- Click the dropdown menu at the top of the Subjects column +- Choose 'Edit cells->Join multi-valued cells' +- In the prompt type the ( | ) symbol and click 'OK' + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- OpenRefine uses rows and columns to display data +- Most options to work with data in OpenRefine are accessed through a drop down menu at the top of a data column +- When you select an option in a particular column (e.g. to make a change to the data), it will affect all the cells in that column +- OpenRefine has a Records mode which links together multiple rows into a single record +- Split and join multi-valued cells to modify the individual values within them +- When creating multi-valued cells in your data, choose a separator that will not appear in the data values + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/04-faceting-and-filtering.md b/episodes/04-faceting-and-filtering.md index a4eb1760..4c2d7f0a 100644 --- a/episodes/04-faceting-and-filtering.md +++ b/episodes/04-faceting-and-filtering.md @@ -1,24 +1,29 @@ --- -title: "Faceting and filtering" +title: Faceting and filtering teaching: 10 exercises: 10 -questions: -- "What is a facet in OpenRefine?" -- "What is a filter in OpenRefine?" -- "How can I use filters and facets to explore data in OpenRefine?" -- "How can I correct common data issues in my data with OpenRefine?" 
-objectives: -- "Explain what Facets and Filters are" -- "Answer questions about the content of a data set using Facets" -- "Use facets and filters to work with a subset of data" -- "Correct data problems through a facet" -keypoints: -- "You can use facets and filters to explore your data" -- "You can use facets and filters work with a subset of data in OpenRefine" -- "You can correct common data issues from a Facet" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain what Facets and Filters are +- Answer questions about the content of a data set using Facets +- Use facets and filters to work with a subset of data +- Correct data problems through a facet + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is a facet in OpenRefine? +- What is a filter in OpenRefine? +- How can I use filters and facets to explore data in OpenRefine? +- How can I correct common data issues in my data with OpenRefine? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Facets + Facets are one of the most useful features of OpenRefine and can help in both getting an overview of the data and to improve the consistency of the data. A 'Facet' groups all the values that appear in a column, and then allows you to filter the data by these values and edit values across many records at the same time. @@ -33,29 +38,45 @@ You can include multiple values from the facet in a filter at one time by using You can also `invert` the filter to show all records which do not match your selected values. This option appears at the top of the Facet panel when you select a value from the facet to apply as a filter. ->## Let's create a text facet ->1. Click on the drop down menu at the top of the publisher column and choose `Facet > Text Facet`. The facet will then appear in the left hand panel ->2. To select a single value, click the text of the relevant line in the facet ->3. 
To select multiple values click the `Include` option on the appropriate line in the facet (which only appears when you mouse over the line) ->3. You can 'invert' your selections to `exclude` ->4. Include a value and then look at top to invert inclusion. -{: .checklist} - ->## Which licences are used for articles in this file? -> Use a `text facet` for the `licence` column and answer these questions: -> ->1. What is the most common Licence in the file? ->2. How many articles in the file don't have a licence assigned? -> ->>## Solution ->>1. Create a facet for the 'Licence' column ->>2. Sort values by `count` ->>3. What is the most common Licence in the file? Answer: `CC BY` ->>4. How many articles in the file don't have a licence assigned? Answer: **6** ->{: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::: checklist + +## Let's create a text facet + +1. Click on the drop down menu at the top of the publisher column and choose `Facet > Text Facet`. The facet will then appear in the left hand panel +2. To select a single value, click the text of the relevant line in the facet +3. To select multiple values click the `Include` option on the appropriate line in the facet (which only appears when you mouse over the line) +4. You can 'invert' your selections to `exclude` +5. Include a value and then look at top to invert inclusion. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +## Which licences are used for articles in this file? + +Use a `text facet` for the `licence` column and answer these questions: + +1. What is the most common Licence in the file? +2. How many articles in the file don't have a licence assigned? + +::::::::::::::: solution + +## Solution + +1. Create a facet for the 'Licence' column +2. Sort values by `count` +3. What is the most common Licence in the file? Answer: `CC BY` +4. How many articles in the file don't have a licence assigned? 
Answer: **6** + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## Filters + As well as using Facets to filter the data displayed in OpenRefine you can also apply 'Text Filters' which looks for a particular piece of text appearing in a column based on a unique text string, like a 'find' feature. Text filters are applied by clicking the drop down menu at the top of the column you want to apply the filter to and choosing 'Text filter'. As with Facets, the Filter options appear in the left hand panel in OpenRefine. As you type the text you want to use in the Filter in the Filter's text box, OpenRefine works to display only rows that contain that text in the relevant column. @@ -63,58 +84,88 @@ As with Facets, the Filter options appear in the left hand panel in OpenRefine. You can also use [regular expressions](https://librarycarpentry.github.io/lc-data-intro/01-regular-expressions/) in the filter. ## Working with filtered data + It is very important to note that when you have filtered the data displayed in OpenRefine, any operations you carry out will apply only to the rows that match the filter - that is the data currently being displayed. To confirm you are working with the data you intended to select, check the number of matching records displayed above the data table. ## Other types of Facet -As well as 'Text facets' OpenRefine also supports a range of other types of facet. These include: -* Numeric facets -* Timeline facets (for dates) -* Scatterplot facets -* Custom facets +As well as 'Text facets' OpenRefine also supports a range of other types of facet. These include: +- Numeric facets +- Timeline facets (for dates) +- Scatterplot facets +- Custom facets **Numeric and Timeline facets** display graphs instead of lists of values. The graph includes 'drag and drop' controls you can use to set a start and end range to filter the data displayed. -**Scatterplot facets** are less commonly used. 
For further information on these see the tutorial at [https://web.archive.org/web/20190105063215/http://enipedia.tudelft.nl/wiki/OpenRefine_Tutorial#Exploring_the_data_with_scatter_plots](https://web.archive.org/web/20190105063215/http://enipedia.tudelft.nl/wiki/OpenRefine_Tutorial#Exploring_the_data_with_scatter_plots). +**Scatterplot facets** are less commonly used. For further information on these see the tutorial at [https://web.archive.org/web/20190105063215/http://enipedia.tudelft.nl/wiki/OpenRefine\_Tutorial#Exploring\_the\_data\_with\_scatter\_plots](https://web.archive.org/web/20190105063215/http://enipedia.tudelft.nl/wiki/OpenRefine_Tutorial#Exploring_the_data_with_scatter_plots). **Custom facets** are a range of different types of facets. Some of the default custom facets are: -* Word facet - this breaks down text into words and counts the number of records each word appears in -* Duplicates facet - this results in a binary facet of 'true' or 'false'. Rows appear in the 'true' facet if the value in the selected column is an exact match for a value in the same column in another row -* Text length facet - creates a numeric facet based on the length (number of characters) of the text in each row for the selected column. This can be useful for spotting incorrect or unusual data in a field where specific lengths are expected (e.g. if the values are expected to be years, any row with a text length more than 4 for that column is likely to be incorrect) -* Facet by blank - a binary facet of 'true' or 'false'. Rows appear in the 'true' facet if they have no data present in that column. This is useful when looking for rows missing key data. +- Word facet - this breaks down text into words and counts the number of records each word appears in +- Duplicates facet - this results in a binary facet of 'true' or 'false'. 
Rows appear in the 'true' facet if the value in the selected column is an exact match for a value in the same column in another row +- Text length facet - creates a numeric facet based on the length (number of characters) of the text in each row for the selected column. This can be useful for spotting incorrect or unusual data in a field where specific lengths are expected (e.g. if the values are expected to be years, any row with a text length more than 4 for that column is likely to be incorrect) +- Facet by blank - a binary facet of 'true' or 'false'. Rows appear in the 'true' facet if they have no data present in that column. This is useful when looking for rows missing key data. Facets are intended to group together common values and OpenRefine limits the number of values allowed in a single facet to ensure the software does not perform slowly or run out of memory. If you create a facet where there are many unique values (for example, a facet on a 'book title' column in a data set that has one row per book) the facet created will be very large and may either slow down the application, or OpenRefine will not create the facet. ->## Find all publications without a DOI ->* Use the `Facet by blank` function to find all publications in this data set without a DOI -> ->>## Solution ->> ->>1. On the `DOI` column drop down and select `Facets > Customized facets > Facet by blank` ->>2. `True` means that it is blank, so you can: ->> * Select `include` on True in the facet to filter the list of publications to only those that don't have a DOI ->{: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +## Find all publications without a DOI + +- Use the `Facet by blank` function to find all publications in this data set without a DOI + +::::::::::::::: solution + +## Solution + +1. On the `DOI` column drop down and select `Facets > Customized facets > Facet by blank` +2. 
`True` means that it is blank, so you can: + - Select `include` on True in the facet to filter the list of publications to only those that don't have a DOI + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## Amending data through facets + If you create a text facet you can edit the values in the facet to change the value for several records at the same time. To do this, mouse-over the value you want to edit and click the 'edit' option that appears. This approach is useful in relatively small facets where you might have small variations through punctuation or typing errors etc. For example, a column that should contain only terms from a small restricted list such as days of the week or months of the year. The list of values in the facet will update as you make edits. ->## Correct the Language values via a facet -> ->* Create a `Text facet` on the `language` column and correct the variation in the `EN` and `English` values. -> ->>## Solution ->>1. Create a Text facet on the Language column ->>2. Notice that there is both `EN` and `English` ->>3. Put the mouse over the `English` value ->>4. Click `Edit` ->>5. Type `EN` and click `Apply` ->>6. See how the Language facet updates ->{: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +## Correct the Language values via a facet + +- Create a `Text facet` on the `language` column and correct the variation in the `EN` and `English` values. + +::::::::::::::: solution + +## Solution + +1. Create a Text facet on the Language column +2. Notice that there is both `EN` and `English` +3. Put the mouse over the `English` value +4. Click `Edit` +5. Type `EN` and click `Apply` +6. 
See how the Language facet updates + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can use facets and filters to explore your data +- You can use facets and filters to work with a subset of data in OpenRefine +- You can correct common data issues from a Facet + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/05-clustering.md b/episodes/05-clustering.md index 0c892f89..75c205b7 100644 --- a/episodes/05-clustering.md +++ b/episodes/05-clustering.md @@ -1,21 +1,25 @@ --- -title: "Clustering" teaching: 10 exercises: 10 -questions: -- "What is Clustering in OpenRefine and when would you use it?" -- "How does clustering work in OpenRefine?" -objectives: -- "Explain what clustering is in OpenRefine" -- "Use clustering to identify and replace varying forms of the same data with a single consistent value" -keypoints: -- "Clustering is a way of finding variant forms of the same piece of data within a dataset (e.g. different spellings of a name)" -- "There are a number of different Clustering algorithms that work in different ways and will produce different results" -- "The best clustering algorithm to use will depend on the data" -- "Using clustering you can replace varying forms of the same data with a single consistent value" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain what clustering is in OpenRefine +- Use clustering to identify and replace varying forms of the same data with a single consistent value + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is Clustering in OpenRefine and when would you use it? +- How does clustering work in OpenRefine?
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Clustering + The Cluster function groups together similar, but inconsistent values in a given column and lets you merge these inconsistent values into a single value you choose. This is very effective where you have data with minor variations in data values, e.g. names of people, organisations, places, classification terms. @@ -28,10 +32,25 @@ For more information on the methods used to create Clusters, see [https://github For each cluster, you have the option of 'merging' the values together - that is, replace the various inconsistent values with a single consistent value. By default, OpenRefine uses the most common value in the cluster as the new value, but you can select another value by clicking the value itself, or you can type the desired value into the 'New Cell Value' box. ->## Use Clustering to clean up author data -> ->1. Split out the author names into individual cells using `Edit cells -> Split multi-valued cells`, using the pipe ( \| ) character as the separator ->2. Choose `Edit cells -> Cluster and edit` from the 'author' column. ->3. Using the `key collision` method with the `fingerprint` Keying Function, work through the clusters of values, merging them to a single value where appropriate ->4. Try changing the clustering method being used - which ones work well? -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +## Use Clustering to clean up author data + +1. Split out the author names into individual cells using `Edit cells -> Split multi-valued cells`, using the pipe ( | ) character as the separator +2. Choose `Edit cells -> Cluster and edit` from the 'author' column. +3. Using the `key collision` method with the `fingerprint` Keying Function, work through the clusters of values, merging them to a single value where appropriate +4. Try changing the clustering method being used - which ones work well? 
+ + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Clustering is a way of finding variant forms of the same piece of data within a dataset (e.g. different spellings of a name) +- There are a number of different Clustering algorithms that work in different ways and will produce different results +- The best clustering algorithm to use will depend on the data +- Using clustering you can replace varying forms of the same data with a single consistent value + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/06-working-with-columns.md b/episodes/06-working-with-columns.md index 7b651848..9d5809d4 100644 --- a/episodes/06-working-with-columns.md +++ b/episodes/06-working-with-columns.md @@ -1,39 +1,55 @@ --- -title: "Working with columns and sorting" +title: Working with columns and sorting teaching: 5 exercises: 5 -questions: -- "How do I move, rename or remove columns in OpenRefine?" -- "How do I sort data in OpenRefine?" -objectives: -- "Explain how to reorder, rename and remove columns" -- "Explain how to sort data in columns" -keypoints: -- "You can reorder, rename and remove columns in OpenRefine" -- "Sorting in OpenRefine always sorts all rows" -- "The original order of rows in OpenRefine is maintained during a sort until you use the option to Reorder Rows Permanently from the Sort drop-down menu" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how to reorder, rename and remove columns +- Explain how to sort data in columns + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I move, rename or remove columns in OpenRefine? +- How do I sort data in OpenRefine? 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Reordering columns + You can reorder the columns by clicking the drop-down menu at the top of the first column (labelled 'All'), and choosing 'Edit columns' > 'Re-order / remove columns …'. You can then drag and drop column names to reorder the columns, or remove columns completely if they are not required. ## Renaming columns -You can rename a column by opening the drop-down menu at the top of the column that you would like to rename, and choosing 'Edit column' > 'Rename this column'. You will then be prompted to enter the new column name. +You can rename a column by opening the drop-down menu at the top of the column that you would like to rename, and choosing 'Edit column' > 'Rename this column'. You will then be prompted to enter the new column name. ## Sorting data + You can sort data in OpenRefine by clicking on the drop-down menu for the column you want to sort on, and choosing `Sort`. Once applied, locate the new "Sort" button at the top of the grid. -![Addition of Sort menu to OpenRefine grid after first sort command](../assets/img/sort-menu-highlight.png) +![](fig/sort-menu-highlight.png){alt='Addition of Sort menu to OpenRefine grid after first sort command'} -Unlike in Excel, 'Sorts' in OpenRefine are temporary - that is, if you remove the `Sort`, the data will go back to its original 'unordered' state. The 'Sort' drop-down menu lets you amend the existing sort (e.g., reverse the sort order), remove existing sorts, and/or make sorts permanent. To make a sort permanent, choose Reorder Rows Permanently from the Sort drop-down menu. +Unlike in Excel, 'Sorts' in OpenRefine are temporary - that is, if you remove the `Sort`, the data will go back to its original 'unordered' state. The 'Sort' drop-down menu lets you amend the existing sort (e.g., reverse the sort order), remove existing sorts, and/or make sorts permanent. 
To make a sort permanent, choose Reorder Rows Permanently from the Sort drop-down menu. You can sort on multiple columns at the same time by adding another sorted column (in the same way). ->##Separator ->Do not rush these last two sentences. Repeat them slowly after a pause and allow learners to explore a moment, even see the growing undo list -{:.instructor} +> \##Separator +> Do not rush these last two sentences. Repeat them slowly after a pause and allow learners to explore a moment, even see the growing undo list +> {:.instructor} + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can reorder, rename and remove columns in OpenRefine +- Sorting in OpenRefine always sorts all rows +- The original order of rows in OpenRefine is maintained during a sort until you use the option to Reorder Rows Permanently from the Sort drop-down menu + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/07-introduction-to-transformations.md b/episodes/07-introduction-to-transformations.md index 9fe50cbc..d5292080 100644 --- a/episodes/07-introduction-to-transformations.md +++ b/episodes/07-introduction-to-transformations.md @@ -1,47 +1,67 @@ --- -title: "Introduction to Transformations" +title: Introduction to Transformations teaching: 5 exercises: 5 -questions: -- "How do I use transformations to programmatically edit my data?" -- "What are the kind of transformations Open Refine supports?" -- "What is GREL?" -objectives: -- "Describe common transformations" -- "Explain GREL, the General Refine Expression Language" -keypoints: -- "Common transformations are available through the Menu option" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe common transformations +- Explain GREL, the General Refine Expression Language + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I use transformations to programmatically edit my data? 
+- What are the kind of transformations Open Refine supports? +- What is GREL? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Introducing Transformations Through facets, filters and clusters OpenRefine offers relatively straightforward ways of getting an overview of your data, and making changes where you want to standardise terms used to a common set of values. However, sometimes there will be changes you want to make to the data that cannot be achieved in this way. Such types of changes include: -* Splitting data that is in a single column into multiple columns (e.g. splitting an address into multiple parts) -* Standardising the format of data in a column without changing the values (e.g. removing punctuation or standardising a date format) -* Extracting a particular type of data from a longer text string (e.g. finding ISBNs in a bibliographic citation) +- Splitting data that is in a single column into multiple columns (e.g. splitting an address into multiple parts) +- Standardising the format of data in a column without changing the values (e.g. removing punctuation or standardising a date format) +- Extracting a particular type of data from a longer text string (e.g. finding ISBNs in a bibliographic citation) To support this type of activity OpenRefine supports 'Transformations' which are ways of manipulating data in columns. Transformations are normally written in a special language called 'GREL' (General Refine Expression Language). To some extent GREL expressions are similar to Excel Formula, although they tend to focus on text manipulations rather than numeric functions. Full documentation for the GREL is available at [https://docs.openrefine.org/manual/grelfunctions](https://docs.openrefine.org/manual/grelfunctions). This tutorial covers only a small subset of the commands available. ### Common transformations + Some transformations are used regularly and are accessible directly through menu options, without having to type them directly. 
Examples of some of these common transformations are given in the table below, with their 'GREL' equivalents. We'll see how to use the GREL version later in this lesson. -Common Transformation | Action | GREL expression ---------------------| ------------- | ------------- -Trim leading and trailing whitespace | Removes any 'whitespace' characters (e.g. spaces, tabs) from the start and end of the current value | ```value.trim()``` -To titlecase| Converts the current value to titlecase (i.e. each word starts with an uppercase character and all other characters are converted to lowercase) | ```value.toTitlecase()``` -To uppercase| Converts the current value to uppercase | ```value.toUppercase()``` -To lowercase| Converts the current value to lowercase | ```value.toLowercase()``` - ->## Correct Publisher data ->1. Create a text facet on the Publisher column ->2. Note that in the values there are two that look almost identical - why do these two values appear separately rather than as a single value? ->3. On the publisher column use the dropdown menu to select ```Edit cells->Common transforms->Collapse consecutive whitespace``` ->4. Look at the publisher facet now - has it changed? (if it hasn't changed try clicking the ```Refresh``` option to make sure it updates) -{: .checklist} +| Common Transformation | Action | GREL expression | +| ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------- | +| Trim leading and trailing whitespace | Removes any 'whitespace' characters (e.g. spaces, tabs) from the start and end of the current value | `value.trim()` | +| To titlecase | Converts the current value to titlecase (i.e. 
each word starts with an uppercase character and all other characters are converted to lowercase) | `value.toTitlecase()` | +| To uppercase | Converts the current value to uppercase | `value.toUppercase()` | +| To lowercase | Converts the current value to lowercase | `value.toLowercase()` | + +::::::::::::::::::::::::::::::::::::::: checklist + +## Correct Publisher data + +1. Create a text facet on the Publisher column +2. Note that in the values there are two that look almost identical - why do these two values appear separately rather than as a single value? +3. On the publisher column use the dropdown menu to select `Edit cells->Common transforms->Collapse consecutive whitespace` +4. Look at the publisher facet now - has it changed? (if it hasn't changed try clicking the `Refresh` option to make sure it updates) + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Common transformations are available through the Menu option + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/08-writing-transformations.md b/episodes/08-writing-transformations.md index ac55040e..10713010 100644 --- a/episodes/08-writing-transformations.md +++ b/episodes/08-writing-transformations.md @@ -1,46 +1,66 @@ --- -title: "Writing Transformations" +title: Writing Transformations teaching: 5 exercises: 10 -questions: -- "Where do I write GREL expressions in the OpenRefine interface?" -- "How do I write a valid GREL expression?" 
-objectives: -- "Explain how to write one's own transformations using GREL" -keypoints: -- "You can alter data in OpenRefine based on specific instructions" -- "You can preview the results of your GREL expression" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how to write one's own transformations using GREL + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- Where do I write GREL expressions in the OpenRefine interface? +- How do I write a valid GREL expression? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Writing transformations -To start writing transformations, select the column on which you wish to perform a transformation and choose ```Edit cells->Transform…```. In the screen that displays you have a place to write a transformation (the 'Expression' box) and then the ability to Preview the effect the transformation would have on 10 rows of your data. +To start writing transformations, select the column on which you wish to perform a transformation and choose `Edit cells->Transform…`. In the screen that displays you have a place to write a transformation (the 'Expression' box) and then the ability to Preview the effect the transformation would have on 10 rows of your data. The transformation you type into the 'Expression' box has to be a valid GREL expression. The default expression is the word `value` by itself - which means the value that is currently in the column - that is: make no change. GREL functions are written by giving a value of some kind (a text string, a date, a number etc.) to a GREL function. Some GREL functions take additional parameters or options which control how the function works. GREL supports two types of syntax: -* ```value.function(options)``` -* ```function(value, options)``` +- `value.function(options)` +- `function(value, options)` Either is valid, and which is used is completely down to personal preference. 
In these notes the first syntax is used. Next to the 'Preview' option are options to view: -* 'History' - a list of transformations you've previously used with the option to reuse them immediately or to 'star' them for easy access -* 'Starred' - a list of transformations you've 'starred' via the 'History' view -* 'Help' - a list of all the GREL functions and brief information on how to use them - ->## Put titles into Title Case ->Use Facets and the GREL expression ```value.toTitlecase()``` to put the titles in Title Case ->1. Facet by publisher ->2. Select "Akshantala Enterprises" and "Society of Pharmaceutical Technocrats" ->3. To select multiple values in the facet use the ```include``` link that appears to the right of the facet ->4. See that the Titles for these are all in uppercase ->5. Click the dropdown menu on the Title column ->6. Choose ```Edit cells->Transform...``` ->7. In the Expression box type ```value.toTitlecase()``` ->8. In the Preview pane under value.toTitlecase() you can see what the effect of running this will be ->9. Click ```OK``` ->0. Find examples of titles that are still not correct, or have been incorrectly cased (abbreviations, species names, etc.) -{: .checklist} +- 'History' - a list of transformations you've previously used with the option to reuse them immediately or to 'star' them for easy access +- 'Starred' - a list of transformations you've 'starred' via the 'History' view +- 'Help' - a list of all the GREL functions and brief information on how to use them + +::::::::::::::::::::::::::::::::::::::: checklist + +## Put titles into Title Case + +Use Facets and the GREL expression `value.toTitlecase()` to put the titles in Title Case + +1. Facet by publisher +2. Select "Akshantala Enterprises" and "Society of Pharmaceutical Technocrats" +3. To select multiple values in the facet use the `include` link that appears to the right of the facet +4. See that the Titles for these are all in uppercase +5. 
Click the dropdown menu on the Title column +6. Choose `Edit cells->Transform...` +7. In the Expression box type `value.toTitlecase()` +8. In the Preview pane under value.toTitlecase() you can see what the effect of running this will be +9. Click `OK` +10. Find examples of titles that are still not correct, or have been incorrectly cased (abbreviations, species names, etc.) + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can alter data in OpenRefine based on specific instructions +- You can preview the results of your GREL expression + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/09-undo-and-redo.md b/episodes/09-undo-and-redo.md index 05895cb1..3e6026de 100644 --- a/episodes/09-undo-and-redo.md +++ b/episodes/09-undo-and-redo.md @@ -1,20 +1,26 @@ --- -title: "Transformations - Undo and Redo" +title: Transformations - Undo and Redo teaching: 5 exercises: 0 -questions: -- "How do the Undo and Redo features work?" -objectives: -- "Explain how to use Undo and Redo to retrace ones' steps" -keypoints: -- "You can use Undo and Redo to retrace ones' steps" -- "You can save and apply a set of steps to a new set of data using the 'Extract' and 'Apply' features" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how to use Undo and Redo to retrace ones' steps + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do the Undo and Redo features work? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Undo and Redo + OpenRefine lets you undo, and redo, any number of steps you have taken in cleaning the data. This means you can always try out transformations and 'undo' if you need to. The way OpenRefine records the steps you have taken even allows you to take the steps you've carried out on one data set, and apply it to another data set by a copy and paste operation. 
-The ```Undo``` and ```Redo``` options are accessed via the lefthand panel. +The `Undo` and `Redo` options are accessed via the lefthand panel. The Undo/Redo panel lists all the steps you've taken so far. To undo steps, click on the last step you want to preserve in the list and this will automatically undo all the changes made since that step. @@ -22,8 +28,17 @@ The remaining steps will continue to show in the list but greyed out, and you ca However, if you 'undo' a set of steps and then start doing new transformations, the greyed out steps will disappear and you will no longer have the option to 'redo' these steps. -If you wish to save a set of steps to be re-applied later, for instance, to a different project, you can click the ```Extract``` button. This gives you the option to select steps that you want to save, and extract the code for those steps in a format called ‘JSON’. You can copy the extracted JSON and save it as a plain text file (e.g. in Notepad). If you are using OpenRefine version 3.6.0 or later, you can also click the ```Export``` button in the "Extract operation history" window to open a save dialog and directly save the JSON instead of first copying it to a text file. +If you wish to save a set of steps to be re-applied later, for instance, to a different project, you can click the `Extract` button. This gives you the option to select steps that you want to save, and extract the code for those steps in a format called ‘JSON'. You can copy the extracted JSON and save it as a plain text file (e.g. in Notepad). If you are using OpenRefine version 3.6.0 or later, you can also click the `Export` button in the "Extract operation history" window to open a save dialog and directly save the JSON instead of first copying it to a text file. -To apply a set of steps you have copied or saved in this 'JSON' format use the ```Apply``` button and paste in the JSON. In this way you can share transformations between projects and with other people. 
+To apply a set of steps you have copied or saved in this 'JSON' format use the `Apply` button and paste in the JSON. In this way you can share transformations between projects and with other people. Undo/Redo data is stored with the Project and is saved automatically as you work, so next time you open the project, you can access your full history of steps you have carried out and undo/redo in exactly the same way. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can use Undo and Redo to retrace ones' steps +- You can save and apply a set of steps to a new set of data using the 'Extract' and 'Apply' features + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/10-data-transformation.md b/episodes/10-data-transformation.md index e07b87da..59c26e83 100755 --- a/episodes/10-data-transformation.md +++ b/episodes/10-data-transformation.md @@ -1,99 +1,137 @@ --- -title: "Transforming Strings, Numbers, Dates and Booleans" +title: Transforming Strings, Numbers, Dates and Booleans teaching: 5 exercises: 15 -questions: -- "How do I use transformations to programmatically edit my data?" -- "How do I transform the various data types?" 
-objectives: -- "Name and describe 4 types of data - String, Number, Date and Boolean" -- "Transform dates for further analysis" -- "Use Boolean to identify information recorded in a different format" -- "Create and run transformations based on Boolean Values" -keypoints: -- "You can alter data in OpenRefine based on specific instructions" -- "You can expand the data editing functions that are built-in into OpenRefine by building your own" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Name and describe 4 types of data - String, Number, Date and Boolean +- Transform dates for further analysis +- Use Boolean to identify information recorded in a different format +- Create and run transformations based on Boolean Values + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I use transformations to programmatically edit my data? +- How do I transform the various data types? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Data types + Understanding data types can help you write a wider variety of transformations using GREL. ->## Data types in OpenRefine ->Every piece of data in OpenRefine has a 'type'. The most common 'type' is a 'string' - that is a piece of text. However there are other data types available and transformations let you convert data from one type to another where appropriate. The data types supported are: -> ->* String ->* Number ->* Date ->* Boolean ->* Array (covered in the next lesson) -{: .callout} +::::::::::::::::::::::::::::::::::::::::: callout + +## Data types in OpenRefine + +Every piece of data in OpenRefine has a 'type'. The most common 'type' is a 'string' - that is a piece of text. However there are other data types available and transformations let you convert data from one type to another where appropriate. 
The data types supported are: + +- String +- Number +- Date +- Boolean +- Array (covered in the next lesson) + + +:::::::::::::::::::::::::::::::::::::::::::::::::: ### Dates and Numbers + So far we've been looking only at 'String' type data. Much of the time it is possible to treat numbers and dates as strings. For example in the Date column we have the date of publication represented as a String. However, some operations and transformations only work on 'number' or 'date' typed data, such as sorting values in numeric or date order. To carry out these functions we need to convert the values to a date or number first. ->## Reformat the Date ->1. Make sure you remove all Facets and Filters ->2. On the Date column use the dropdown menu to select ```Edit cells -> Transform``` ->2. In the 'Expression' box type the GREL expression ```value.toDate("dd/MM/yyyy")``` and press OK. ->3. Note how the values are now displayed in green and follow a standard convention for their display format (ISO 8601) - this indicates they are now stored as date data types in OpenRefine. We can now carry out functions that are specific to Dates ->4. On the Date column dropdown select ```Edit column->Add column based on this column```. Using this function you can create a new column, while preserving the old column ->5. In the 'New column name' type "Formatted-Date" ->6. In the 'Expression' box type the GREL expression ```value.toString("dd MMMM yyyy") -{: .checklist} - ->## Specifying Date Formatting in GREL Expressions -> ->GREL allow us to specify date and time using ```pattern strings```, which are letters that have some specific representation in the function call. -> ->Pattern strings are case sensitive, therefore capital and lower case letters have a different meaning and usage. -{: .callout} +::::::::::::::::::::::::::::::::::::::: checklist + +## Reformat the Date + +1. Make sure you remove all Facets and Filters +2. 
On the Date column use the dropdown menu to select `Edit cells -> Transform` +3. In the 'Expression' box type the GREL expression `value.toDate("dd/MM/yyyy")` and press OK. +4. Note how the values are now displayed in green and follow a standard convention for their display format (ISO 8601) - this indicates they are now stored as date data types in OpenRefine. We can now carry out functions that are specific to Dates +5. On the Date column dropdown select `Edit column->Add column based on this column`. Using this function you can create a new column, while preserving the old column +6. In the 'New column name' type "Formatted-Date" +7. In the 'Expression' box type the GREL expression \`\`\`value.toString("dd MMMM yyyy") + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::: callout + +## Specifying Date Formatting in GREL Expressions + +GREL allow us to specify date and time using `pattern strings`, which are letters that have some specific representation in the function call. + +Pattern strings are case sensitive, therefore capital and lower case letters have a different meaning and usage. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: The table below shows letters related to date and time representation. -| Letter| Date or Time Representation| -| ------------- |:-------------:| -| `y` | Year| -| `M` | Month in year| -| `D` | Day in year| -| `d` | Day in month| -| `F` | Day of week in month| -| `E` | Day name in week| -| `u` | Day number of week| -| `a` | AM/PM marker| +| Letter | Date or Time Representation | +| --------------------------- | :-------------------------: | +| `y` | Year | +| `M` | Month in year | +| `D` | Day in year | +| `d` | Day in month | +| `F` | Day of week in month | +| `E` | Day name in week | +| `u` | Day number of week | +| `a` | AM/PM marker | The table below presents examples on how to use the patterns as input and the obtained output. 
-| Date and Time Pattern Input| Output| -| ------------- |:-------------:| -| `"yyyy-MM-dd"`| 2022-06-05| -| `"dd MMM yyyy"`| 05 Jun 2022| -| `"EEE, MMM d, ''yy"`| Mon, Jun 5, '22| -| `"yyyy.MMMM.dd hh:mm a"`| 2022.June.05 12:10 PM| -| `"EEE, d MMM yyyy HH:mm:ss"`| Mon, 5 Jun 2022 12:10:10| +| Date and Time Pattern Input | Output | +| --------------------------- | :-------------------------: | +| `"yyyy-MM-dd"` | 2022-06-05 | +| `"dd MMM yyyy"` | 05 Jun 2022 | +| `"EEE, MMM d, ''yy"` | Mon, Jun 5, '22 | +| `"yyyy.MMMM.dd hh:mm a"` | 2022\.June.05 12:10 PM | +| `"EEE, d MMM yyyy HH:mm:ss"` | Mon, 5 Jun 2022 12:10:10 | For a more detailed explanation checkout [OpenRefine Documentation](https://docs.openrefine.org/manual/grelfunctions#date-functions). - ### Booleans + A 'Boolean' is a binary value that can either be 'true' or 'false'. Boolean values can be used directly in OpenRefine cells, but are more often used in transformations as part of a GREL expression. For example the GREL expression + ``` value.contains("test") ``` + generates a boolean value of either 'true' or 'false' depending on whether the current value in the cell contains the text 'test' anywhere. -Such tests can be combined with other GREL expressions to create more complex transformations. For example, to carry out a further transformation only if a test is successful. The GREL transformation ```if(value.contains("test"),"Test data",value)``` replaces a cell value with the words "Test data" only *if* the value in the cell contains the string "test" anywhere. - ->## Find Reversed Author Names ->In this exercise we are going to use the Boolean data type. ->If you look at the Authors column, you can see that most of the author names are written with the personal name first. However, a few have been reversed to put the family name first. -> ->We can do a crude test for reversed author names by looking for those that contain a comma: -> ->1. 
Make sure you have already split the author names into individual cells using ```Edit cells->Split multi-valued cells``` (you should have done this in exercise 5) ->2. On the Authors column, use the dropdown menu and select ```Facet->Custom text facet...``` ->3. The Custom text facet function allows you to write GREL functions to create a facet ->4. In the Expression box type ```value.contains(",")``` ->* Click ```OK``` ->* Since the 'contains' function outputs a Boolean value, you should see a facet that contains 'false' and 'true'. These represent the outcome of the expression, i.e. true = values containing a comma; false = values not containing a comma ->* In order to change the names to personal name first order, see the Arrays lesson. -{: .checklist} +Such tests can be combined with other GREL expressions to create more complex transformations. For example, to carry out a further transformation only if a test is successful. The GREL transformation `if(value.contains("test"),"Test data",value)` replaces a cell value with the words "Test data" only *if* the value in the cell contains the string "test" anywhere. + +::::::::::::::::::::::::::::::::::::::: checklist + +## Find Reversed Author Names + +In this exercise we are going to use the Boolean data type. +If you look at the Authors column, you can see that most of the author names are written with the personal name first. However, a few have been reversed to put the family name first. + +We can do a crude test for reversed author names by looking for those that contain a comma: + +1. Make sure you have already split the author names into individual cells using `Edit cells->Split multi-valued cells` (you should have done this in exercise 5) +2. On the Authors column, use the dropdown menu and select `Facet->Custom text facet...` +3. The Custom text facet function allows you to write GREL functions to create a facet +4. 
In the Expression box type `value.contains(",")` + +- Click `OK` +- Since the 'contains' function outputs a Boolean value, you should see a facet that contains 'false' and 'true'. These represent the outcome of the expression, i.e. true = values containing a comma; false = values not containing a comma +- In order to change the names to personal name first order, see the Arrays lesson. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can alter data in OpenRefine based on specific instructions +- You can expand the data editing functions that are built-in into OpenRefine by building your own + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/11-using-arrays-transformations.md b/episodes/11-using-arrays-transformations.md index bb9e90b5..59ae2ce4 100644 --- a/episodes/11-using-arrays-transformations.md +++ b/episodes/11-using-arrays-transformations.md @@ -1,23 +1,30 @@ --- -title: "Transformations - Handling Arrays" +title: Transformations - Handling Arrays teaching: 5 exercises: 15 -questions: -- "How do I use Arrays in data transformation?" -objectives: -- "Understand the purpose of Arrays in OpenRefine" -- "Use arrays as part of transformations in GREL" -keypoints: -- "Arrays cannot appear directly in an OpenRefine cell" -- "Arrays can be used in many ways using GREL expressions" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Understand the purpose of Arrays in OpenRefine +- Use arrays as part of transformations in GREL + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I use Arrays in data transformation? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Preview -The following example is chosen to demonstrate how to go from a list with duplicated values to a list with each value just once by using an array in a transformation. 
->## Caution ->Ask the students what transformation means to them currently. Many may only know it from Excel to convert columns into rows or vice versa. Discuss how in OpenRefine, transformation is specifically the working window--these values are neither stored nor displayed in the cells or output. -{:.instructor} +The following example is chosen to demonstrate how to go from a list with duplicated values to a list with each value just once by using an array in a transformation. + +> ## Caution +> +> Ask the students what transformation means to them currently. Many may only know it from Excel to convert columns into rows or vice versa. Discuss how in OpenRefine, transformation is specifically the working window--these values are neither stored nor displayed in the cells or output. +> {:.instructor} It does this using a function called uniques() which can be used to remove duplicates from an array. In this example we start with a list of subject words: @@ -26,6 +33,7 @@ It does this using a function called uniques() which can be used to remove dupli Examining this by eye we can see it contains "crystal structure" twice. If we assume that each cell in the subject column might have duplicates in it, and in each case the subject word/phrase that is duplicated could be different, then it's not practical to "fix" this problem (remove the duplicates from each) by find and replace. However, we can do it using an array. The lesson's goal is: an array as something you can manipulate. To remove the repetition we show how to do a GREL transformation like: + ``` value.split("|").uniques().join("|") ``` @@ -39,6 +47,7 @@ In total this transformation does three steps: Let us now move from a list with duplicated values to a list with each value just once using an array in transformation. ## Arrays + An 'Array' is a data type (as mentioned in the previous lesson) which can contain a list of values. 
In OpenRefine an array is represented by the use of square brackets containing a list of values separated by commas. For example: @@ -46,66 +55,89 @@ For example: - an array containing a list of strings (in this case subject keywords or phrases) could look like: `["crystal structure", "clozapinium", "crystal structure", "molecular configuration", "hydrogen bonding", "supramolecular assembly", "Chemistry", "QD1-999"]` - an array containing a list of numbers could look like: `[1, 2, 3, 4]` -Arrays can be sorted, de-duplicated, and manipulated in other ways in GREL expressions, but cannot be stored directly in an OpenRefine cell. Arrays in OpenRefine are usually the result of a transformation written with GREL. For example the ```split``` function takes a string, and changes it into an array based on a 'separator'. For example if a cell has the value: +Arrays can be sorted, de-duplicated, and manipulated in other ways in GREL expressions, but cannot be stored directly in an OpenRefine cell. Arrays in OpenRefine are usually the result of a transformation written with GREL. For example the `split` function takes a string, and changes it into an array based on a 'separator'. For example if a cell has the value: `"crystal structure|clozapinium|crystal structure|molecular configuration|hydrogen bonding|supramolecular assembly|Chemistry|QD1-999"` -This can be transformed into an array using the ```split``` function specifying the pipe character ( | ) as the separating character. Recall the cautionary note about separator choice from [Working with Data](https://librarycarpentry.org/lc-open-refine/03-working-with-data/index.html). +This can be transformed into an array using the `split` function specifying the pipe character ( | ) as the separating character. Recall the cautionary note about separator choice from [Working with Data](https://librarycarpentry.org/lc-open-refine/03-working-with-data/index.html). . 
+ ``` value.split("|") ``` + This would create the array containing a list of subject headings, separated by a pipe character | (as in the first example above). In the transformation preview the array will display as a list of comma separated values in double quotes, with the whole array surrounded by square brackets. -This subject string can be found for the title "The crystal structures of three clozapinium salts: different molecular configurations, and supramolecular assembly in one, two and three dimensions" in the original project. +This subject string can be found for the title "The crystal structures of three clozapinium salts: different molecular configurations, and supramolecular assembly in one, two and three dimensions" in the original project. + +This can be combined with array operations like `uniques`. For example, assuming the cell contains the same value as above, then the function -This can be combined with array operations like ```uniques```. For example, assuming the cell contains the same value as above, then the function ``` value.split("|").uniques() ``` + would result in the following array: ["crystal structure", "clozapinium", "molecular configuration", "hydrogen bonding", "supramolecular assembly", "Chemistry", "QD1-999"] -Compared to the first example, now the second 'crystal structure' has been removed. +Compared to the first example, now the second 'crystal structure' has been removed. You can extract a specific item from the array using the square bracket notation and number for position in sequence: + ``` value.split("|")[0] ``` -would result in the string: + +would result in the string: "crystal structure" You can also join arrays together to make a 'String'. The GREL expression would look like + ``` value.split("|").uniques().join("|") ``` + Taking the same example again, this would result in a string in which each subject appears just once, separated by a pipe character and kept in the original order.
> ## Caution ->Recall previous discussion of dangers of changing separators--especially commas. Possible question to pose: Which subject would be broken if a hyphen were used as a separator? -{:.instructor} - ->## Reverse author names ->You may already have done the boolean exercise and have a facet containing the names in personal name first order. In this case, select the 'true' facet and start with the step **"9. On the ```Authors``` column use..."** below. -> ->In this exercise we are going to use both the Boolean and Array data types. ->If you look at the Authors column, you can see that most of the author names are written in personal name first order. However, a few have been reversed to put the family name first. -> ->We can do a crude test for reversed author names by looking for those that contain a comma: -> ->1. Make sure you have already split the author names into individual cells using ```Edit cells->Split multi-valued cells``` (you should have done this in the Clustering lesson) ->2. On the Authors column, use the dropdown menu and select ```Facet->Custom text facet...``` ->3. The ```Custom text``` facet function allows you to write GREL functions to create a facet ->4. In the Expression box type ```value.contains(",")``` ->5. Click ```OK``` ->6. Since the ```contains``` function outputs a Boolean value, you should see a facet that contains 'false' and 'true'. These represent the outcome of the expression, i.e. true = values containing a comma; false = values not containing a comma ->7. In this facet select 'true' to narrow down to the author names that contain a comma ->8. Now we have narrowed down to the lines with a comma in a name, we can use the GREL ```split``` function. This is different to the ```Split multi-valued cells``` operation we have previously used as it allows us to manipulate the content of a cell, rather than create new cells. ->9. On the ```Authors``` column use the dropdown menu and select ```Edit cells->Transform ``` ->10. 
In the Expression box type ```value.split(", ")``` (make sure to include a space after the comma inside the split expression to avoid extra spaces in your author name later). ->11. See how this creates an array with two members in each row in the Preview column ->12. To get the author name in personal name first order you can reverse the array and join it back together with a space to create the string you need: ->13. In the Expression box, add to the existing expression until it reads ```value.split(", ").reverse().join(" ")``` ->14. In the Preview view you should be able see this has reversed the array, and joined it back into a string ->15. Click ```OK``` -{: .checklist} +> +> Recall previous discussion of dangers of changing separators--especially commas. Possible question to pose: Which subject would be broken if a hyphen were used as a separator? +> {:.instructor} + +::::::::::::::::::::::::::::::::::::::: checklist + +## Reverse author names + +You may already have done the boolean exercise and have a facet containing the names in personal name first order. In this case, select the 'true' facet and start with the step **"9. On the `Authors` column use..."** below. + +In this exercise we are going to use both the Boolean and Array data types. +If you look at the Authors column, you can see that most of the author names are written in personal name first order. However, a few have been reversed to put the family name first. + +We can do a crude test for reversed author names by looking for those that contain a comma: + +1. Make sure you have already split the author names into individual cells using `Edit cells->Split multi-valued cells` (you should have done this in the Clustering lesson) +2. On the Authors column, use the dropdown menu and select `Facet->Custom text facet...` +3. The `Custom text` facet function allows you to write GREL functions to create a facet +4. In the Expression box type `value.contains(",")` +5. Click `OK` +6. 
Since the `contains` function outputs a Boolean value, you should see a facet that contains 'false' and 'true'. These represent the outcome of the expression, i.e. true = values containing a comma; false = values not containing a comma +7. In this facet select 'true' to narrow down to the author names that contain a comma +8. Now we have narrowed down to the lines with a comma in a name, we can use the GREL `split` function. This is different to the `Split multi-valued cells` operation we have previously used as it allows us to manipulate the content of a cell, rather than create new cells. +9. On the `Authors` column use the dropdown menu and select `Edit cells->Transform ` +10. In the Expression box type `value.split(", ")` (make sure to include a space after the comma inside the split expression to avoid extra spaces in your author name later). +11. See how this creates an array with two members in each row in the Preview column +12. To get the author name in personal name first order you can reverse the array and join it back together with a space to create the string you need: +13. In the Expression box, add to the existing expression until it reads `value.split(", ").reverse().join(" ")` +14. In the Preview view you should be able see this has reversed the array, and joined it back into a string +15. Click `OK` + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Arrays cannot appear directly in an OpenRefine cell +- Arrays can be used in many ways using GREL expressions + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/12-export-transformation.md b/episodes/12-export-transformation.md index 85faeabc..72440c8c 100644 --- a/episodes/12-export-transformation.md +++ b/episodes/12-export-transformation.md @@ -1,24 +1,41 @@ --- -title: "Exporting data" +title: Exporting data teaching: 5 exercises: 0 -questions: -- "How do I export data from OpenRefine?" 
-objectives: -- "Explain how to export data in different formats from OpenRefine" -keypoints: -- "You can export your data in a variety of formats" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how to export data in different formats from OpenRefine + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I export data from OpenRefine? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Note about OpenRefine + All the edits you make to your data using OpenRefine are being stored inside the OpenRefine program and are not being saved to your original file. That's why OpenRefine uses the terms "import" and "export" to talk about moving your data in and out of the OpenRefine interface. Therefore, in order to save your work into a file format you can view in other programs or share with others, you need to export your data. ## Exporting data -Once you have finished working with a data set in OpenRefine you may wish to export it. The export options are accessed through the ```Export``` button at the top right of the OpenRefine interface. +Once you have finished working with a data set in OpenRefine you may wish to export it. The export options are accessed through the `Export` button at the top right of the OpenRefine interface. Supported export formats include HTML, Excel and comma- and tab-separated value (csv and tsv). You can also write a custom export, selecting to export specific fields, adding a header or footer and specifying the exact format. ## Exporting a portion of your data -You can also export a portion of your data by using facets or filters to select a portion of your data. With only those rows selected, you can select the export format and your resulting file will only include the select rows.
-*Note well:* It's easy to export only a portion of your data by accident, so make sure you look at the top left to ensure all rows are being displayed when you want to do a full export. +You can also export a portion of your data by using facets or filters to select a portion of your data. With only those rows selected, you can select the export format and your resulting file will only include the selected rows. + +*Note well:* It's easy to export only a portion of your data by accident, so make sure you look at the top left to ensure all rows are being displayed when you want to do a full export. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- You can export your data in a variety of formats + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/13-looking-up-data.md b/episodes/13-looking-up-data.md index d795f381..b221d828 100644 --- a/episodes/13-looking-up-data.md +++ b/episodes/13-looking-up-data.md @@ -1,22 +1,26 @@ --- -title: "Looking Up Data" +title: Looking Up Data teaching: 20 exercises: 10 -questions: -- "How do I fetch data from an Application Programming Interface (API) to be used in OpenRefine?"
-- "How do I reconcile my data by comparing it to authoritative datasets" -- "How do I install extensions for OpenRefine" -objectives: -- "Use URLs to fetch data from the web based on columns in an OpenRefine project" -- "Add columns to parse JSON data returned by web services" -- "Understand how Reconciliation services are used to validate data" -- "Add functionality using OpenRefine extensions" -keypoints: -- "OpenRefine can look up custom URLs to fetch data based on what's in an OpenRefine project" -- "Such API calls can be custom built, or one can use existing Reconciliation services to enrich data" -- "OpenRefine can be further enhanced by installing extensions" --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Use URLs to fetch data from the web based on columns in an OpenRefine project +- Add columns to parse JSON data returned by web services +- Understand how Reconciliation services are used to validate data +- Add functionality using OpenRefine extensions + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I fetch data from an Application Programming Interface (API) to be used in OpenRefine? +- How do I reconcile my data by comparing it to authoritative datasets +- How do I install extensions for OpenRefine + +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Looking up data from a URL OpenRefine can retrieve data from URLs. This can be used in various ways, including looking up additional information from a remote service, based on information in your OpenRefine data. @@ -25,64 +29,70 @@ As an example, you can look up names against the Virtual International Authority Typically this is a two step process, firstly a step to retrieve data from a remote service, and secondly to extract the relevant information from the data you have retrieved. 
-To retrieve data from an external source, use the drop down menu at any column heading and select ‘Edit column->Add column by fetching URLs’. +To retrieve data from an external source, use the drop down menu at any column heading and select ‘Edit column->Add column by fetching URLs'. This will prompt you for a GREL expression to create a URL. Usually this would be a URL that uses existing values in your data to build a query. When the query runs OpenRefine will request each URL (for each line) and retrieve whatever data is returned (this may often be structured data, but could be HTML). The data retrieved will be stored in a cell in the new column that has been added to the project. You can then use OpenRefine transformations to extract relevant information from the data that has been retrieved. Two specific OpenRefine functions used for this are: -* parseHtml() -* parseJson() +- parseHtml() +- parseJson() The 'parseHtml()' function can also be used to extract data from XML. The next exercise demonstrates this two stage process in full. ->## Retrieving journal details from CrossRef via ISSN ->Because retrieving data from external URLs takes time, this exercise targets a single line in the data. In reality you would want to run this over many rows (and probably go and do something else while it ran). -> ->* Select a single row from the data set which contains an ISSN by: -> * Clicking the star icon for the relevant row in the first column -> * Facet by Star -> * Choose the single row ->* In the ISSN column use the dropdown menu to choose 'Edit column->Add column by fetching URLs' ->* Give the column a name e.g. "Journal-Details" ->* In the expression box you need to write some GREL where the output of the expression is a URL which can be used to retrieve data (the format of the data could be HTML, XML, JSON, or some other text format) -> ->In this case we are going to use the CrossRef API: [https://api.crossref.org/](https://api.crossref.org/). 
Read more about the CrossRef service: [https://crossref.org](https://crossref.org). Note that API providers may impose rate limits or have other requirements for using their data, so it's important to check the site's documentation. To comply with API rate limits, use the Throttle Delay setting to specify the number of milliseconds between URL requests. CrossRef, for instance, [asks users](https://www.crossref.org/documentation/retrieve-metadata/rest-api/tips-for-using-the-crossref-rest-api/#pick-the-right-service-level) to "specify a User-Agent header that properly identifies your script or tool and that provides a means of contacting you via email using 'mailto:'." User-agent headers provide administrators with user information that facilitates better administration and moderation of the API, and it is generally good etiquette to include a header with any API request. -> ->To edit your User-Agent header: ->* Click 'Show' (next to 'HTTP headers to be used when fetching URLs'). Note that OpenRefine has already populated the 'User-Agent' field with information about the version of OpenRefine you are using; it should look similar to ``` OpenRefine 3... [...]``` (the information following ```OpenRefine``` will depend on the version of OpenRefine you are using). ->* At the end of the existing text, add ```; mailto:address@library.edu```, using your own email address. The full User-Agent field should now be similar to ``` OpenRefine 3... [...]; mailto:address@library.edu``` but reflect your version information and email address. 
-> ->The syntax for requesting journal information from CrossRef is ```https://api.crossref.org/journals/{ISSN}``` where {ISSN} is replaced with the ISSN of the journal -> ->* In the expression box type the GREL ```"https://api.crossref.org/journals/"+value``` -> ->At this point, your screen should be similar to this: ->![Add column by fetching URLs screen capture](../assets/img/openrefine_add_columns_by_url.png) -> ->* Click 'OK' -> -> ->You should see a message at the top on the OpenRefine screen indicating it is fetching some data, with progress showing the percentage of the proportion of rows of data successfully being fetched. Wait for this to complete. Fetching data for a single row should take only ten seconds or so, but fetching data for all rows will take longer. You can speed this up by modifying the "Throttle Delay" setting in the 'Add column by fetching URLs' dialog which controls the delay between each URL request made by OpenRefine. This is defaulted to a rather large 5000 milliseconds (5 seconds). -> ->At this point you should have a new cell containing a long text string in a format called 'JSON' (this stands for JavaScript Object Notation, although very rarely spelt out in full). -> ->OpenRefine has a function for extracting data from JSON (sometimes referred to as 'parsing' the JSON). The 'parseJson' function is explained in more detail at [https://docs.openrefine.org/manual/grelfunctions/#format-based-functions-json-html-xml](https://docs.openrefine.org/manual/grelfunctions/#format-based-functions-json-html-xml). -> ->* In the new column you've just added use the dropdown menu to access 'Edit column->Add column based on this column' ->* Add a name for the new column e.g. 
"Journal-Title" ->* In the Expression box type the GREL ```value.parseJson().message.title``` ->* You should see in the Preview the Journal title displays -> ->The reason for using 'Add column based on this column' is that this allows you to retain the full JSON and extract further data from it if you need to. If you only wanted the title and did not need any other information from the JSON you could use 'Edit cells->Transform...' with the same GREL expression. -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +## Retrieving journal details from CrossRef via ISSN + +Because retrieving data from external URLs takes time, this exercise targets a single line in the data. In reality you would want to run this over many rows (and probably go and do something else while it ran). + +- Select a single row from the data set which contains an ISSN by: + - Clicking the star icon for the relevant row in the first column + - Facet by Star + - Choose the single row +- In the ISSN column use the dropdown menu to choose 'Edit column->Add column by fetching URLs' +- Give the column a name e.g. "Journal-Details" +- In the expression box you need to write some GREL where the output of the expression is a URL which can be used to retrieve data (the format of the data could be HTML, XML, JSON, or some other text format) + +In this case we are going to use the CrossRef API: [https://api.crossref.org/](https://api.crossref.org/). Read more about the CrossRef service: [https://crossref.org](https://crossref.org). Note that API providers may impose rate limits or have other requirements for using their data, so it's important to check the site's documentation. To comply with API rate limits, use the Throttle Delay setting to specify the number of milliseconds between URL requests. 
CrossRef, for instance, [asks users](https://www.crossref.org/documentation/retrieve-metadata/rest-api/tips-for-using-the-crossref-rest-api/#pick-the-right-service-level) to "specify a User-Agent header that properly identifies your script or tool and that provides a means of contacting you via email using 'mailto:'." User-agent headers provide administrators with user information that facilitates better administration and moderation of the API, and it is generally good etiquette to include a header with any API request. + +To edit your User-Agent header: + +- Click 'Show' (next to 'HTTP headers to be used when fetching URLs'). Note that OpenRefine has already populated the 'User-Agent' field with information about the version of OpenRefine you are using; it should look similar to ` OpenRefine 3... [...]` (the information following `OpenRefine` will depend on the version of OpenRefine you are using). +- At the end of the existing text, add `; mailto:address@library.edu`, using your own email address. The full User-Agent field should now be similar to ` OpenRefine 3... [...]; mailto:address@library.edu` but reflect your version information and email address. + +The syntax for requesting journal information from CrossRef is `https://api.crossref.org/journals/{ISSN}` where {ISSN} is replaced with the ISSN of the journal + +- In the expression box type the GREL `"https://api.crossref.org/journals/"+value` + +At this point, your screen should be similar to this: +![](fig/openrefine_add_columns_by_url.png){alt='Add column by fetching URLs screen capture'} + +- Click 'OK' + +You should see a message at the top on the OpenRefine screen indicating it is fetching some data, with progress showing the percentage of the proportion of rows of data successfully being fetched. Wait for this to complete. Fetching data for a single row should take only ten seconds or so, but fetching data for all rows will take longer. 
You can speed this up by modifying the "Throttle Delay" setting in the 'Add column by fetching URLs' dialog which controls the delay between each URL request made by OpenRefine. This is defaulted to a rather large 5000 milliseconds (5 seconds). + +At this point you should have a new cell containing a long text string in a format called 'JSON' (this stands for JavaScript Object Notation, although very rarely spelt out in full). + +OpenRefine has a function for extracting data from JSON (sometimes referred to as 'parsing' the JSON). The 'parseJson' function is explained in more detail at [https://docs.openrefine.org/manual/grelfunctions/#format-based-functions-json-html-xml](https://docs.openrefine.org/manual/grelfunctions/#format-based-functions-json-html-xml). + +- In the new column you've just added use the dropdown menu to access 'Edit column->Add column based on this column' +- Add a name for the new column e.g. "Journal-Title" +- In the Expression box type the GREL `value.parseJson().message.title` +- You should see in the Preview the Journal title displays + +The reason for using 'Add column based on this column' is that this allows you to retain the full JSON and extract further data from it if you need to. If you only wanted the title and did not need any other information from the JSON you could use 'Edit cells->Transform...' with the same GREL expression. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## Reconciliation services + Reconciliation services allow you to lookup terms from your data in OpenRefine against external services, and use values from the external services in your data. The official User Manual provides [detailed information about the reconciliation feature](https://docs.openrefine.org/manual/reconciling). -Reconciliation services can be more sophisticated and often quicker than using the method described above to retrieve data from a URL. 
However, to use the ‘Reconciliation’ function in OpenRefine requires the external resource to support the necessary service for OpenRefine to work with, which means unless the service you wish to use supports such a service you cannot use the ‘Reconciliation’ approach. +Reconciliation services can be more sophisticated and often quicker than using the method described above to retrieve data from a URL. However, to use the ‘Reconciliation' function in OpenRefine requires the external resource to support the necessary service for OpenRefine to work with, which means unless the service you wish to use supports such a service you cannot use the ‘Reconciliation' approach. There are a few services where you can find an OpenRefine Reconciliation option available. For example Wikidata has a reconciliation service at [https://wikidata.reconci.link/](https://wikidata.reconci.link/). @@ -90,68 +100,75 @@ In other cases people have built reconciliation applications for a specific serv One of the most common ways of using the reconciliation option in OpenRefine is with an extension (see below for more on extensions to OpenRefine) which can use linked data sources for reconciliation. The RDF extension by Stuart Kenny can be downloaded from [https://github.com/stkenny/grefine-rdf-extension/releases](https://github.com/stkenny/grefine-rdf-extension/releases). -Other extensions are available to do reconciliation against local data such as csv files (see [http://okfnlabs.org/reconcile-csv/](http://okfnlabs.org/reconcile-csv/)) and maintained lists of values (see [http://okfnlabs.org/projects/nomenklatura/index.html](http://okfnlabs.org/projects/nomenklatura/index.html)). +Other extensions are available to do reconciliation against local data such as csv files (see [http://okfnlabs.org/reconcile-csv/](https://okfnlabs.org/reconcile-csv/)) and maintained lists of values (see [http://okfnlabs.org/projects/nomenklatura/index.html](https://okfnlabs.org/projects/nomenklatura/index.html)). 
For more information on using Reconciliation services see [https://docs.openrefine.org/manual/reconciling](https://docs.openrefine.org/manual/reconciling). ->## Reconcile Publisher names with VIAF IDs ->In this exercise you are going to use the VIAF Reconciliation service written by [Jeff Chiu](https://twitter.com/absolutelyjeff). Jeff offers two ways of using the reconciliation service - either via a public service he runs at [http://refine.codefork.com/](http://refine.codefork.com/), or by installing and running the service locally using the instructions at [https://github.com/codeforkjeff/conciliator](https://github.com/codeforkjeff/conciliator). -> ->If you are going to do a lot of reconciliation, please install and run your own local reconciliation service following the instructions at [https://github.com/codeforkjeff/conciliator](https://github.com/codeforkjeff/conciliator#running-conciliator-on-your-own-computer). -> ->Once you have chosen which service you are going to use: -> ->* In the Publisher column use the dropdown menu to choose 'Reconcile->Start Reconciling' ->* If this is the first time you've used this particular reconciliation service, you'll need to add the details of the service now -> * Click 'Add Standard Service...' and in the dialogue that appears enter: -> * "https://refine.codefork.com/reconcile/viaf" for Jeff's public service -> * "http://localhost:8080/reconcile/viaf" if you are running the service locally ->* You should now see a heading in the list on the left hand side of the Reconciliation dialogue called "VIAF" ->* Click on this to choose to use this reconciliation service ->* In the middle box in the reconciliation dialogue you may get asked what type of 'entity' you want to reconcile to - that is, what type of thing are you looking for. The list will vary depending on what reconciliation service you are using. 
-> * In this case choose "Corporate Name" (it seems like the VIAF Reconciliation Service is slightly intelligent about this and will only offer options that are relevant) ->* In the box on the righthand side of the reconciliation dialogue you can choose if other columns are used to help the reconciliation service make a match - however it is sometimes hard to tell what use (if any) the reconciliation service makes of these additional columns ->* At the bottom of the reconciliation dialogue there is the option to "Auto-match candidates with high confidence". This can be a time saver, but in this case you are going to uncheck it, so you can see the results before a match is made ->* Now click 'Start Reconciling' -> ->Reconciliation is an operation that can take a little time if you have many values to look up. However, in this case there are only 6 publishers to check, so it should work quite quickly. -> ->Once the reconciliation has completed two Facets should be created automatically: ->* Publisher: Judgement ->* Publisher: best candidate's score -> ->These are two of several specific reconciliation facets and actions that you can get from the 'Reconcile' menu (from the column drop down menu). -> ->* Close the 'Publisher: best candidate's score' facet, but leave the 'Publisher: Judgement' facet open -> ->If you look at the Publisher column, you should see some cells have found one or more matches - the potential matches are shown in a list in each cell. Next to each potential match there is a 'tick' and a 'double tick'. To accept a reconciliation match you can use the 'tick' options in cells. The 'tick' accepts the match for the single cell, the 'double tick' accepts the match for all identical cells. -> ->* Create a text facet on the Publisher column ->* Choose 'International Union of Crystallography' -> ->In the Publisher column you should be able to see the various potential matches. Clicking on a match will take you to the VIAF page for that entity. 
-> ->* Click a 'double tick' in one of the Publisher column cells for the option "International Union of Crystallography" ->* This will accept this as a match for all cells - you should see the other options all disappear ->* Check the 'Publisher: Judgement' facet. This should now show that 858 items are 'matched' (if this does not update, try refreshing the facets) -> ->We could do these one by one, but if we are confident with the matches, there is an option to accept all: -> ->* Remove all filters/facets from the project so all rows display ->* In the Publisher column use the dropdown menu to choose 'Reconcile->Actions->Match each cell to its best candidate' -> ->There are two things that reconciliation can do for you. Firstly it gets a standard form of the name or label for the entity. Secondly it gets an ID for the entity - in this case a VIAF id. This is hidden in the default view, but can be extracted: -> ->* In the Publisher column use the dropdown menu to choose 'Edit column->Add column based on this column...' ->* Give the column the name 'VIAF-ID' ->* In the GREL expression box type ```cell.recon.match.id``` ->* This will create a new column that contains the VIAF ID for the matched entity -{: .challenge} - -## Using the ‘cross’ function to lookup data in other OpenRefine projects -As well as looking up data in external systems using the methods described above, it is also possible to look up data in other OpenRefine projects on the same computer. This is done using the ‘cross’ function. - -The ‘cross’ function takes a value from the OpenRefine project you are working on, and looks for that value in a column in another OpenRefine project. If it finds one or more matching rows in the second OpenRefine project, it returns an array containing the rows that it has matched. 
+::::::::::::::::::::::::::::::::::::::: challenge + +## Reconcile Publisher names with VIAF IDs + +In this exercise you are going to use the VIAF Reconciliation service written by [Jeff Chiu](https://twitter.com/absolutelyjeff). Jeff offers two ways of using the reconciliation service - either via a public service he runs at [http://refine.codefork.com/](https://refine.codefork.com/), or by installing and running the service locally using the instructions at [https://github.com/codeforkjeff/conciliator](https://github.com/codeforkjeff/conciliator). + +If you are going to do a lot of reconciliation, please install and run your own local reconciliation service following the instructions at [https://github.com/codeforkjeff/conciliator](https://github.com/codeforkjeff/conciliator#running-conciliator-on-your-own-computer). + +Once you have chosen which service you are going to use: + +- In the Publisher column use the dropdown menu to choose 'Reconcile->Start Reconciling' +- If this is the first time you've used this particular reconciliation service, you'll need to add the details of the service now + - Click 'Add Standard Service...' and in the dialogue that appears enter: + - "[https://refine.codefork.com/reconcile/viaf](https://refine.codefork.com/reconcile/viaf)" for Jeff's public service + - "[http://localhost:8080/reconcile/viaf](http://localhost:8080/reconcile/viaf)" if you are running the service locally +- You should now see a heading in the list on the left hand side of the Reconciliation dialogue called "VIAF" +- Click on this to choose to use this reconciliation service +- In the middle box in the reconciliation dialogue you may get asked what type of 'entity' you want to reconcile to - that is, what type of thing are you looking for. The list will vary depending on what reconciliation service you are using. 
+ - In this case choose "Corporate Name" (it seems like the VIAF Reconciliation Service is slightly intelligent about this and will only offer options that are relevant) +- In the box on the righthand side of the reconciliation dialogue you can choose if other columns are used to help the reconciliation service make a match - however it is sometimes hard to tell what use (if any) the reconciliation service makes of these additional columns +- At the bottom of the reconciliation dialogue there is the option to "Auto-match candidates with high confidence". This can be a time saver, but in this case you are going to uncheck it, so you can see the results before a match is made +- Now click 'Start Reconciling' + +Reconciliation is an operation that can take a little time if you have many values to look up. However, in this case there are only 6 publishers to check, so it should work quite quickly. + +Once the reconciliation has completed two Facets should be created automatically: + +- Publisher: Judgement +- Publisher: best candidate's score + +These are two of several specific reconciliation facets and actions that you can get from the 'Reconcile' menu (from the column drop down menu). + +- Close the 'Publisher: best candidate's score' facet, but leave the 'Publisher: Judgement' facet open + +If you look at the Publisher column, you should see some cells have found one or more matches - the potential matches are shown in a list in each cell. Next to each potential match there is a 'tick' and a 'double tick'. To accept a reconciliation match you can use the 'tick' options in cells. The 'tick' accepts the match for the single cell, the 'double tick' accepts the match for all identical cells. + +- Create a text facet on the Publisher column +- Choose 'International Union of Crystallography' + +In the Publisher column you should be able to see the various potential matches. Clicking on a match will take you to the VIAF page for that entity. 
+ +- Click a 'double tick' in one of the Publisher column cells for the option "International Union of Crystallography" +- This will accept this as a match for all cells - you should see the other options all disappear +- Check the 'Publisher: Judgement' facet. This should now show that 858 items are 'matched' (if this does not update, try refreshing the facets) + +We could do these one by one, but if we are confident with the matches, there is an option to accept all: + +- Remove all filters/facets from the project so all rows display +- In the Publisher column use the dropdown menu to choose 'Reconcile->Actions->Match each cell to its best candidate' + +There are two things that reconciliation can do for you. Firstly it gets a standard form of the name or label for the entity. Secondly it gets an ID for the entity - in this case a VIAF id. This is hidden in the default view, but can be extracted: + +- In the Publisher column use the dropdown menu to choose 'Edit column->Add column based on this column...' +- Give the column the name 'VIAF-ID' +- In the GREL expression box type `cell.recon.match.id` +- This will create a new column that contains the VIAF ID for the matched entity + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Using the ‘cross’ function to lookup data in other OpenRefine projects + +As well as looking up data in external systems using the methods described above, it is also possible to look up data in other OpenRefine projects on the same computer. This is done using the ‘cross’ function. + +The ‘cross’ function takes a value from the OpenRefine project you are working on, and looks for that value in a column in another OpenRefine project. If it finds one or more matching rows in the second OpenRefine project, it returns an array containing the rows that it has matched. As it returns the whole row for each match, you can use a transformation to extract the values from any of the columns in the second project. 
@@ -159,8 +176,23 @@ You can use this function to compare the contents of two OpenRefine projects, or The VIB-Bits extension adds a number of very useful functions to OpenRefine. One of them is "Add column(s) from other projects...", which provides a dialog window to help you work with the `cross` function with less typing. ->## Extensions ->The functionality in OpenRefine can be enhanced by ‘extensions’ which can be downloaded and installed to add functionality to your OpenRefine installation. -> ->A list of Extensions (not necessarily complete) is given on the OpenRefine downloads page at [http://openrefine.org/download.html](http://openrefine.org/download.html). -{: .callout} +::::::::::::::::::::::::::::::::::::::::: callout + +## Extensions + +The functionality in OpenRefine can be enhanced by ‘extensions’ which can be downloaded and installed to add functionality to your OpenRefine installation. + +A list of Extensions (not necessarily complete) is given on the OpenRefine downloads page at [http://openrefine.org/download.html](https://openrefine.org/download.html). 
+ + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- OpenRefine can look up custom URLs to fetch data based on what's in an OpenRefine project +- Such API calls can be custom built, or one can use existing Reconciliation services to enrich data +- OpenRefine can be further enhanced by installing extensions + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/data/UCSD_Guardian.csv b/episodes/data/UCSD_Guardian.csv similarity index 100% rename from data/UCSD_Guardian.csv rename to episodes/data/UCSD_Guardian.csv diff --git a/data/doaj-article-sample.csv b/episodes/data/doaj-article-sample.csv similarity index 100% rename from data/doaj-article-sample.csv rename to episodes/data/doaj-article-sample.csv diff --git a/data/petitions-archive-list.xlsx b/episodes/data/petitions-archive-list.xlsx similarity index 100% rename from data/petitions-archive-list.xlsx rename to episodes/data/petitions-archive-list.xlsx diff --git a/data/solar-patents.csv b/episodes/data/solar-patents.csv similarity index 100% rename from data/solar-patents.csv rename to episodes/data/solar-patents.csv diff --git a/files/draft-instructor-notes.md b/episodes/files/draft-instructor-notes.md similarity index 100% rename from files/draft-instructor-notes.md rename to episodes/files/draft-instructor-notes.md diff --git a/index.md b/index.md index 450d4d19..bf396c80 100644 --- a/index.md +++ b/index.md @@ -1,14 +1,22 @@ --- -layout: lesson +site: sandpaper::sandpaper_site --- + This Library Carpentry lesson introduces people working in library- and information-related roles to working with data in OpenRefine. At the conclusion of the lesson you will understand what the OpenRefine software does and how to use the OpenRefine software to work with data files. -> ## Prerequisites -> To complete this lesson you will need to: -> -> 1. Install OpenRefine or use it through a cloud service -> 1. Download a data file -> 1. 
Use a supported browser -> -> See [our setup page](https://librarycarpentry.org/lc-open-refine/setup.html) for more information. -{: .prereq} +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Prerequisites + +To complete this lesson you will need to: + +1. Install OpenRefine or use it through a cloud service +2. Download a data file +3. Use a supported browser + +See [our setup page](https://librarycarpentry.org/lc-open-refine/setup.html) for more information. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/_extras/guide.md b/instructors/instructor-notes.md similarity index 54% rename from _extras/guide.md rename to instructors/instructor-notes.md index 17655d99..2fc1ff1c 100644 --- a/_extras/guide.md +++ b/instructors/instructor-notes.md @@ -1,49 +1,55 @@ --- -layout: page -title: "Instructor Notes" +title: Instructor Notes --- -____ +*** + # Tips and Tricks -____ +*** + ## Making a handout -Adapt/print from: +Adapt/print from: -* [Library Carpentry Reference Page](https://librarycarpentry.org/lc-open-refine/reference.html) -* [Instructor Draft Notes](https://github.com/LibraryCarpentry/lc-open-refine/blob/gh-pages/files/draft-instructor-notes.md) -* [Introduction to OpenRefine by Owen Stephens](http://www.meanboyfriend.com/overdue_ideas/wp-content/uploads/2014/11/Introduction-to-OpenRefine-handout-CC-BY.pdf) +- [Library Carpentry Reference Page](https://librarycarpentry.org/lc-open-refine/reference.html) +- [Instructor Draft Notes](https://github.com/LibraryCarpentry/lc-open-refine/blob/gh-pages/files/draft-instructor-notes.md) +- [Introduction to OpenRefine by Owen Stephens](https://www.meanboyfriend.com/overdue_ideas/wp-content/uploads/2014/11/Introduction-to-OpenRefine-handout-CC-BY.pdf) + +*** -____ # General notes on OpenRefine ## Common problems -* If learners are using a browser other than Firefox, or OpenRefine does not automatically open for them when they click the .exe file, have them point their browser at 
http://127.0.0.1:3333/ or http://localhost:3333 to launch the program. +- If learners are using a browser other than Firefox, or OpenRefine does not automatically open for them when they click the .exe file, have them point their browser at [http://127.0.0.1:3333/](http://127.0.0.1:3333/) or [http://localhost:3333](http://localhost:3333) to launch the program. -* Mac users with the newest operating system will have to allow this to run by "allowing everything" to run. They can change the setting back after the exercise. +- Mac users with the newest operating system will have to allow this to run by "allowing everything" to run. They can change the setting back after the exercise. -* Some students will run into issues with +- Some students will run into issues with + - unzipping - finding the .exe file once the software has been unzipped - finding the data file on their computers after downloading + +- If OpenRefine crashes when launched from a network share drive, do the following: -* If OpenRefine crashes when launched from a network share drive, do the following: - - Copy the OpenRefine folder to a local drive not mapped to a network share, e.g. 
"C:\\Users\\JaneDoe" - Open a Windows Command prompt - - Change the working directory to the OpenRefine folder at "C:\Users\JaneDoe" + - Change the working directory to the OpenRefine folder at "C:\\Users\\JaneDoe" - Run openrefine.exe -* If "https" doesn't work to fetch CrossRef during Advanced OpenRefine Functions, they can try "http" +- If "https" doesn't work to fetch CrossRef during Advanced OpenRefine Functions, they can try "http" -* If they need to diagnose failure to fetch the content from the URL they can check the "Store error" option in the "Add column by fetching URLs" dialogue and try looking at the common problems listed in the [documentation](https://docs.openrefine.org/manual/columnediting#common-errors) +- If they need to diagnose failure to fetch the content from the URL they can check the "Store error" option in the "Add column by fetching URLs" dialogue and try looking at the common problems listed in the [documentation](https://docs.openrefine.org/manual/columnediting#common-errors) -* The data for this lesson was pulled from DOAJ in 2015 and may not reflect the same data currently available from DOAJ on the day of your workshop. +- The data for this lesson was pulled from DOAJ in 2015 and may not reflect the same data currently available from DOAJ on the day of your workshop. ## Powerful transformations -* In the titlecase exercise, highlight the fact that +- In the titlecase exercise, highlight the fact that each transformation can have unintended side effects, and advise that running one cleanup operation too few may sometimes be preferable to one too many. 
+ + diff --git a/_extras/discuss.md b/learners/discuss.md similarity index 98% rename from _extras/discuss.md rename to learners/discuss.md index 05e80fa2..0eef9730 100644 --- a/_extras/discuss.md +++ b/learners/discuss.md @@ -1,7 +1,7 @@ --- -layout: page title: Discussion --- + There are many ways to discuss Library Carpentry lessons: - Join our [Gitter discussion forum](https://gitter.im/LibraryCarpentry/).[Deprecated 2021] @@ -9,3 +9,5 @@ There are many ways to discuss Library Carpentry lessons: - Stay in touch with our [Topicbox Group](https://carpentries.topicbox.com/groups/discuss-library-carpentry). - Follow updates on [Twitter](https://twitter.com/LibCarpentry). - Make a suggestion or correct an error by [raising an issue](https://github.com/LibraryCarpentry/lc-open-refine/issues) or submitting a [pull request](https://github.com/LibraryCarpentry/lc-open-refine/pulls). + + diff --git a/learners/reference.md b/learners/reference.md new file mode 100644 index 00000000..839b5967 --- /dev/null +++ b/learners/reference.md @@ -0,0 +1,9 @@ +--- +title: 'Reference' +--- + +## Glossary + +FIXME This is a placeholder file. Please add content here. + + diff --git a/setup.md b/learners/setup.md similarity index 50% rename from setup.md rename to learners/setup.md index 56f221c6..3fb16d93 100644 --- a/setup.md +++ b/learners/setup.md @@ -1,5 +1,4 @@ --- -layout: page title: Setup --- @@ -9,41 +8,42 @@ You need to install OpenRefine and download a data file to follow this lesson. ### Installing and running OpenRefine -OpenRefine is a free, open-source Java application. You can download OpenRefine from -[http://openrefine.org/download.html](http://openrefine.org/download.html). +OpenRefine is a free, open-source Java application. You can download OpenRefine from +[http://openrefine.org/download.html](https://openrefine.org/download.html). 
This lesson has been tested with all versions of OpenRefine up to the latest tested version, 3.6.1 -Packages are available on for Windows, macOS, and Linux. +Packages are available on [https://openrefine.org/download.html](https://openrefine.org/download.html) for Windows, macOS, and Linux. Please download the latest stable version, choosing the "kit" for your operating system. Current versions of the "Windows kit with embedded Java" and "Mac kit" include everything you need to run OpenRefine. The "Linux kit" and traditional "Windows kit" require a "Java Runtime Environment" (JRE) installed on your system (see notes below). -If you are using an older version of OpenRefine, it is recommended you upgrade to the latest tested version. +If you are using an older version of OpenRefine, it is recommended you upgrade to the latest tested version. -Please follow OpenRefine's manual to [install](https://docs.openrefine.org/manual/installing) and [run](https://docs.openrefine.org/manual/running) it. +Please follow OpenRefine's manual to [install](https://docs.openrefine.org/manual/installing) and [run](https://docs.openrefine.org/manual/running) it. When running OpenRefine, initially a command line window will open. This is a window with a black background. As OpenRefine runs, lines of text will appear in the command line window. Then the Open Refine interface will open in your default web browser. You do not need to interact with the command line window. Leave it open in the background, and work on datasets in your web browser. Notes: -* When you download OpenRefine for Windows or Linux from the address above, you are downloading an archive file -(zip or tar). To install OpenRefine unzip the downloaded file to a permanent location on your computer. This can -be to a personal directory or to an applications or software directory - OpenRefine should run wherever you put the -unzipped folder. 
The location has to be a "local" drive as problems have been reported trying to run OpenRefine -from a Network drive. -* The options "Windows kit with embedded Java" and "Mac kit" include Java as part of the package. You **do not** -need to install Java if you use one of these kits. This is the preferred method on Windows and Mac systems. -* On Windows, if you use the traditional "Windows kit" without embedded Java, you will need a -"Java Runtime Environment" (JRE) on your system. If you do not already have JRE or JDK installed, -you can visit [Adopt OpenJDK](https://adoptopenjdk.net/) or [Oracle Java](https://java.com/en/download/) -to download an installer package. Please note that -[Oracle significantly changed their license terms in 2019](https://www.oracle.com/java/technologies/javase/jdk-faqs.html) limiting it to "personal use" without a paid license. If you use OpenRefine at work or in research, OpenJDK is preferred. -* On Linux a "Java Runtime Environment" (JRE) will be required to run OpenRefine. If you do not already have -JRE or JDK installed on your system, most distribution repositories will contain OpenJRE / OpenJDK packages. -Install the default version available from your distribution. For example, on Ubuntu/Debian: -`sudo apt install default-jre`. -* OpenRefine does not support Internet Explorer. Please use [Firefox](https://www.mozilla.org/firefox/new/), -[Microsoft Edge](https://www.microsoft.com/edge), -[Chrome](https://www.google.com/chrome/) or [Safari](https://www.apple.com/safari/) instead. + +- When you download OpenRefine for Windows or Linux from the address above, you are downloading an archive file + (zip or tar). To install OpenRefine unzip the downloaded file to a permanent location on your computer. This can + be to a personal directory or to an applications or software directory - OpenRefine should run wherever you put the + unzipped folder. 
The location has to be a "local" drive as problems have been reported trying to run OpenRefine + from a Network drive. +- The options "Windows kit with embedded Java" and "Mac kit" include Java as part of the package. You **do not** + need to install Java if you use one of these kits. This is the preferred method on Windows and Mac systems. +- On Windows, if you use the traditional "Windows kit" without embedded Java, you will need a + "Java Runtime Environment" (JRE) on your system. If you do not already have JRE or JDK installed, + you can visit [Adopt OpenJDK](https://adoptopenjdk.net/) or [Oracle Java](https://java.com/en/download/) + to download an installer package. Please note that + [Oracle significantly changed their license terms in 2019](https://www.oracle.com/java/technologies/javase/jdk-faqs.html) limiting it to "personal use" without a paid license. If you use OpenRefine at work or in research, OpenJDK is preferred. +- On Linux a "Java Runtime Environment" (JRE) will be required to run OpenRefine. If you do not already have + JRE or JDK installed on your system, most distribution repositories will contain OpenJRE / OpenJDK packages. + Install the default version available from your distribution. For example, on Ubuntu/Debian: + `sudo apt install default-jre`. +- OpenRefine does not support Internet Explorer. Please use [Firefox](https://www.mozilla.org/firefox/new/), + [Microsoft Edge](https://www.microsoft.com/edge), + [Chrome](https://www.google.com/chrome/) or [Safari](https://www.apple.com/safari/) instead. ### OpenRefine cloud services @@ -53,11 +53,11 @@ It's free to use without registration, but it's the older OpenRefine 3.4.1, [restricted to 1-2 GB RAM](https://mybinder.readthedocs.io/en/latest/faq.html#how-much-memory-am-i-given-when-using-binder), and the server will be deleted after 10 minutes of inactivity. 
-[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/betatim/openrefineder/6ba108b?urlpath=%2Fopenrefine) +[![](https://mybinder.org/badge.svg){alt='Binder'}](https://mybinder.org/v2/gh/betatim/openrefineder/6ba108b?urlpath=%2Fopenrefine) ### Downloading the data -You can download [doaj-article-sample.csv](https://github.com/LibraryCarpentry/lc-open-refine/raw/gh-pages/data/doaj-article-sample.csv), which is a csv file that will open in a new browser tab. Be sure to right click or control click in order to save the file (NOTE: In Safari, right click and select **download linked file**; in Chrome and Firefox, right click and select **save link as...**). Make a note of the location (i.e. the folder, your desktop) to which you save the file. +You can download [doaj-article-sample.csv](data/doaj-article-sample.csv), which is a csv file that will open in a new browser tab. Be sure to right click or control click in order to save the file (NOTE: In Safari, right click and select **download linked file**; in Chrome and Firefox, right click and select **save link as...**). Make a note of the location (i.e. the folder, your desktop) to which you save the file. 
### Exiting OpenRefine @@ -72,14 +72,14 @@ You may also want to check the [Stack Overflow OpenRefine tag](https://stackover There are also general and specialist tutorials about using OpenRefine available on the web, including: -* Official wiki [List of OpenRefine External Resources](https://github.com/OpenRefine/OpenRefine/wiki/External-Resources) -* [Getting started with OpenRefine by Thomas Padilla](http://thomaspadilla.org/dataprep/) -* [Cleaning Data with OpenRefine by Seth van Hooland, Ruben Verborgh and Max De Wilde](http://programminghistorian.org/lessons/cleaning-data-with-openrefine) -* [Blog posts on using OpenRefine from Owen Stephens](http://www.meanboyfriend.com/overdue_ideas/tag/openrefine/?orderby=date&order=ASC) -* [Identifying potential headings for Authority work using III Sierra, MS Excel and OpenRefine](http://epublications.marquette.edu/lib_fac/81/) -* [Free your metadata website](http://freeyourmetadata.org) -* [Data Munging Tools in Preparation for RDF: Catmandu and LODRefine by Christina Harlow](http://journal.code4lib.org/articles/11013) -* [Cleaning Data with OpenRefine by John Little](https://libjohn.github.io/openrefine/) -* [OpenRefine Blog](https://openrefine.org/category/blog.html) - -[template]: {{ site.workshop_repo }} +- Official wiki [List of OpenRefine External Resources](https://github.com/OpenRefine/OpenRefine/wiki/External-Resources) +- [Getting started with OpenRefine by Thomas Padilla](https://thomaspadilla.org/dataprep/) +- [Cleaning Data with OpenRefine by Seth van Hooland, Ruben Verborgh and Max De Wilde](https://programminghistorian.org/lessons/cleaning-data-with-openrefine) +- [Blog posts on using OpenRefine from Owen Stephens](https://www.meanboyfriend.com/overdue_ideas/tag/openrefine/?orderby=date&order=ASC) +- [Identifying potential headings for Authority work using III Sierra, MS Excel and OpenRefine](https://epublications.marquette.edu/lib_fac/81/) +- [Free your metadata website](https://freeyourmetadata.org) +- 
[Data Munging Tools in Preparation for RDF: Catmandu and LODRefine by Christina Harlow](https://journal.code4lib.org/articles/11013) +- [Cleaning Data with OpenRefine by John Little](https://libjohn.github.io/openrefine/) +- [OpenRefine Blog](https://openrefine.org/category/blog.html) + + diff --git a/profiles/learner-profiles.md b/profiles/learner-profiles.md new file mode 100644 index 00000000..434e335a --- /dev/null +++ b/profiles/learner-profiles.md @@ -0,0 +1,5 @@ +--- +title: FIXME +--- + +This is a placeholder file. Please add content here. diff --git a/reference.md b/reference.md deleted file mode 100644 index d32d8fb4..00000000 --- a/reference.md +++ /dev/null @@ -1,4 +0,0 @@ ---- -layout: reference ---- - diff --git a/site/README.md b/site/README.md new file mode 100644 index 00000000..42997e3d --- /dev/null +++ b/site/README.md @@ -0,0 +1,2 @@ +This directory contains rendered lesson materials. Please do not edit files +here.