CentreForDigitalHumanities · lukavdplas · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Python package](https://github.com/CentreForDigitalHumanities/ianalyzer-readers/actions/workflows/python-package.yml/badge.svg)](https://github.com/CentreForDigitalHumanities/ianalyzer-readers/actions/workflows/python-package.yml)
 [![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest)
 
-`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, JSON, XLSX or RDF (Linked Data) files.
+`ianalyzer-readers` is a python module to extract data from a variety of formats, including CSV, XML, HTML, XLSX, JSON, and RDF.
 
 This module was originally created for [I-analyzer](https://github.com/CentreForDigitalHumanities/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type.
 

diff --git a/docs/api.md b/docs/api.md
@@ -42,6 +42,12 @@ __Module:__ `ianalyzer_readers.readers.json`
 
 ::: ianalyzer_readers.readers.json
 
+## RDS reader
+
+__Module:__ `ianalyzer_readers.readers.rds`
+
+::: ianalyzer_readers.readers.rds
+
 ## Extractors
 
 __Module:__ `ianalyzer_readers.extract`

diff --git a/docs/index.md b/docs/index.md
@@ -1,6 +1,6 @@
 # Getting started
 
-`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV or XLSX files.
+`ianalyzer-readers` is a python module to extract data from a variety of formats, including CSV, XML, HTML, XLSX, JSON, and RDF.
 
 This module was originally created for [I-analyzer](https://github.com/CentreForDigitalHumanities/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type.
 

diff --git a/ianalyzer_readers/readers/rds.py b/ianalyzer_readers/readers/rds.py
@@ -0,0 +1,28 @@
+from typing import Iterable, Dict
+
+import pyreadr
+import pandas
+
+from .core import Reader
+
+
+class RDSReader(Reader):
+    '''
+    A base class for Readers that extract data from RDS files (containing serialised
+    R dataframes).
+
+    RDS files are parsed using the `pyreadr` library.
+
+    Only file sources are supported. Row values can be extracted with the CSV extractor.
+    '''
+
+    def data_from_file(self, path: str) -> Iterable[Dict]:
+        result = pyreadr.read_r(path)
+        data: pandas.DataFrame = result['data']
+
+        for _, row in data.iterrows():
+            yield {index: value for index, value in row.items()}
+
+    def iterate_data(self, data: Iterable[Dict], metadata: Dict):
+        for row in data:
+            yield {'rows': [row]} # format is for compatability with the CSV extractor
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
   "pandas",
   "requests",
   "rdflib",
+  "pyreadr>=0.5.4",
 ]
 
 description = "Utilities for extracting XML, HTML, CSV, XLSX, and RDF data with a common interface"

diff --git a/requirements.txt b/requirements.txt
@@ -21,9 +21,13 @@ numpy==2.3.3
 openpyxl==3.1.5
     # via ianalyzer_readers (setup.py)
 pandas==2.3.3
-    # via ianalyzer_readers (setup.py)
+    # via
+    #   ianalyzer_readers (setup.py)
+    #   pyreadr
 pyparsing==3.2.5
     # via rdflib
+pyreadr==0.5.4
+    # via ianalyzer_readers (setup.py)
 python-dateutil==2.9.0.post0
     # via pandas
 pytz==2025.2

diff --git a/tests/rds/data/hamlet.rds b/tests/rds/data/hamlet.rds
diff --git a/tests/rds/generate_file.R b/tests/rds/generate_file.R
@@ -0,0 +1,34 @@
+# script used to generate ./data/hamlet.rds
+#
+# Builds a small data frame holding the opening lines of Hamlet, act 1, scene 5,
+# and writes it to disk as the fixture read by the RDS reader test.
+#
+# The output path is relative, so presumably the script has to be run from the
+# directory that contains it - TODO confirm.
+
+# One row per line of the play. `act` and `scene` are given as single values while
+# `character` and `line` have 11 entries each; presumably data.frame() repeats the
+# single values on every row - TODO confirm.
+data <- data.frame(
+    act=1,
+    scene=5,
+    character=c(
+        # the first entry has no speaker: it belongs to the scene heading below
+        "",
+        "HAMLET",
+        "GHOST",
+        "HAMLET",
+        "GHOST",
+        "GHOST",
+        "GHOST",
+        "HAMLET",
+        "GHOST",
+        "GHOST",
+        "HAMLET"
+    ),
+    line=c(
+        "SCENE V. A more remote part of the Castle.",
+        "Whither wilt thou lead me? Speak, I'll go no further.",
+        "Mark me.",
+        "I will.",
+        "My hour is almost come,",
+        "When I to sulph'rous and tormenting flames",
+        "Must render up myself.",
+        "Alas, poor ghost!",
+        "Pity me not, but lend thy serious hearing",
+        "To what I shall unfold.",
+        "Speak, I am bound to hear."
+    )
+)
+
+# NOTE(review): save() appears to write R's workspace (RData) format, storing the
+# object under its variable name `data`, even though the file gets an .rds extension.
+# A single-object RDS file would be written with saveRDS(data, file=...) instead.
+# Confirm which format the reader is meant to consume before regenerating the fixture.
+save(data, file="data/hamlet.rds")
diff --git a/tests/rds/test_rds_reader.py b/tests/rds/test_rds_reader.py
@@ -0,0 +1,37 @@
+import os
+
+from ianalyzer_readers.readers.rds import RDSReader
+from ianalyzer_readers.readers.core import Field
+from ianalyzer_readers.extract import Constant, CSV
+
+class HamletReader(RDSReader):
+    data_directory = os.path.dirname(__file__) + '/data'
+
+    def sources(self):
+        for filename in os.listdir(self.data_directory):
+            yield os.path.join(self.data_directory, filename)
+
+    fields = [
+        Field(
+            name='play',
+            extractor=Constant('Hamlet'),
+        ),
+        Field(
+            name='character',
+            extractor=CSV('character')
+        ),
+        Field(
+            name='line',
+            extractor=CSV('line')
+        ),
+    ]
+
+def test_rds_reader():
+    reader = HamletReader()
+    docs = list(reader.documents())
+    assert len(docs) == 11
+    assert docs[1] == {
+        'play': 'Hamlet',
+        'character': 'HAMLET',
+        'line': 'Whither wilt thou lead me? Speak, I\'ll go no further.',
+    }