Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ For each year, IPEDS splits data into several files - up to several dozen. The d

Each file has a corresponding dictionary .zip, which includes .xls, .xlsx, or .html dictionaries. According to NCES, there is no comprehensive dictionary available.

Beware: variable names frequently change between years. In other cases, the variable name will stay the same but the value levels will change (e.g. 1,2,3 in 2000 and 5,10,15,20 in 2001). I don't have a good answer for comparing between years, besides looking at the data dictionaries. If you have a better answer please share!

Beware: variable names frequently change between years. In other cases, the variable name will stay the same but the value levels will change (e.g. 1,2,3 in 2000 and 5,10,15,20 in 2001). To help with this, I created additional Python scripts that pull and assemble the labels from Stata do files into a single label dictionary, which can then be applied to the queried data by a combination of year, variable name, and label value. This last step happens in the R script, which replaces dummy values in the files with their string labels from the label dictionary.

## Functions
### Scrape list of available files
Expand All @@ -16,21 +15,38 @@ Assembles [data/ipedsfiles.json](data/ipedsfiles.json) with info on all availabl
python3 scripts/scraper.py
```

### Make the list of available files browsable
Assembles [data/ipedsfiles.json](data/ipedsfiles.json) with info on the topics of items, for easier searching
```python
python3 scripts/scraperDescriptions.py
```
### Download do files
Download stata do files listed in [data/ipedsfiles.json](data/ipedsfiles.json) for a given range of years.
```python
python3 scripts/downloadStataDoFiles.py STARTYEAR STOPYEAR
```

### Download data files
Download data files listed in [data/ipedsfiles.json](data/ipedsfiles.json) for a given range of years.
```python
python3 scripts/downloadData.py STARTYEAR STOPYEAR
```
### Assemble a master dictionary
Downloads and extracts dictionary files for given years from [data/ipedsfiles.json](data/ipedsfiles.json), compiles the .xls and .xlsx dictionaries into [data/dictionary.csv](data/dictionary.csv)
* Note: pre-2009 dictionaries are saved in .html files and are not parsed here.
```python
python3 scripts/makeDictionary.py STARTYEAR STOPYEAR
```

### Download files
Download data files listed in [data/ipedsfiles.json](data/ipedsfiles.json) for a given range of years.
### Assemble a master data label repository
Reads re-naming conventions from do files and creates a master list of data labels and values by year and file for later matching
```python
python3 scripts/downloadData.py STARTYEAR STOPYEAR
python3 scripts/extract_and_compile_labels.py STARTYEAR STOPYEAR
```



### Get column names
Get column names from downloaded files for a given range of years and save in a json.
```python
python3 scripts/getColumnNames.py STARTYEAR STOPYEAR
```
9,965 changes: 0 additions & 9,965 deletions data/dictionary.csv

This file was deleted.

1 change: 0 additions & 1 deletion data/ipedscolumns.json

This file was deleted.

1 change: 0 additions & 1 deletion data/ipedsfiles.json

This file was deleted.

237 changes: 153 additions & 84 deletions scripts/downloadData.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,153 @@
# -*- coding: utf-8 -*-
"""
Download all IPEDS Complete Data Files for a given set of years
Extract and keep final/revised versions
Make a json specifying columns in each data file
Hannah Recht, 04-04-16
"""

from urllib.request import urlopen
import json
import zipfile
import os
import csv  # NOTE(review): appears unused in this file's visible code — confirm before removing
import argparse

# Command-line interface: two required positional ints, e.g.
#   python3 scripts/downloadData.py 2000 2005
parser = argparse.ArgumentParser()
parser.add_argument("start", help="start year",
                    type=int)
parser.add_argument("stop", help="stop year",
                    type=int)
args = parser.parse_args()

# Import json of available files, created in scraper.py
# (path is relative, so the script must be run from the repo root)
with open('data/ipedsfiles.json') as fp:
    allfiles = json.load(fp)

# Download all the data in given years
def downloadData(start, stop):
print("*****************************")
print("Downloading data")
print("*****************************")
for i in range(start,stop):
print("Downloading " + str(i) + " data files")
# Make directory for the raw files - one per year
if not os.path.exists('raw/' + str(i) + '/'):
os.makedirs('raw/' + str(i) + '/')
# Download all the files in the json
for f in allfiles:
if(f['year']==i):
# URL to download
url = f['dataurl']
# dataset file name (XXXX.zip)
urlname = url.split("http://nces.ed.gov/ipeds/datacenter/data/",1)[1]
rd = urlopen(url)
saveurl = "raw/" + str(i) +'/' + urlname
# Save the zip files
with open(saveurl, "wb") as p:
p.write(rd.read())
p.close()

# Unzip .zips
zip_ref = zipfile.ZipFile(saveurl, 'r')
zip_ref.extractall("raw/" + str(i) +'/')
zip_ref.close()

# Remove zip file
os.remove("raw/" + str(i) +'/' + urlname)

# Some datasets have been revised over time, so they'll download XXXX.csv and XXXX_rv.csv
# We only want the revised version
def removeDups(start, stop):
print("*****************************")
print("Removing duplicates")
print("*****************************")
for i in range(start,stop):
print("Removing " + str(i) + " duplicates")
files = os.listdir('raw/' + str(i) + '/')
# See how many files are in each year
# print([i,len(files)])
for file in files:
# file name minus '.csv'
name = file[:-4]
# If the file name ends in _rv, keep that one and delete the other (no _rv)
if(name[-3:] =='_rv'):
#print(name)
unrevised = name[:-3]
if(os.path.exists('raw/' + str(i) + '/' + unrevised + '.csv')):
os.remove('raw/' + str(i) + '/' + unrevised + '.csv')
print('Removed ' + unrevised)
# else:
# print('no match ' + unrevised)

# Run the download, then remove superseded unrevised files, for the
# year range given on the command line (stop year is exclusive).
downloadData(args.start, args.stop)
removeDups(args.start, args.stop)
# Functions to get data from IPEDS csvs into R, format, join into one long data frame

library("jsonlite")
library("dplyr")
library("stringr")
library("openxlsx")
library("readxl")


# NOTE(review): hard-coded user-specific Windows paths — parameterize (or use
# here::here()/relative paths) before sharing this script.
ipedspath <- "C:/Users/allubera/Documents/PythonScripts/ipeds-scraper-2025/"
allfiles <- fromJSON(readLines("C:/Users/allubera/Documents/PythonScripts/ipeds-scraper-2025/data/ipedsfiles.json",warn=F))
datacols <- fromJSON(readLines("C:/Users/allubera/Documents/PythonScripts/ipeds-scraper-2025/data/ipedscolumns.json",warn = F))

# IPEDS dictionary
dictionary <- read.csv(paste(ipedspath, "data/dictionary.csv", sep=""), stringsAsFactors = F)

# Join colnames to file info, remove FLAGS datasets, using 1990+
# `ipeds` is used as a global lookup table by searchVars()/makeDataset() below
ipeds <- left_join(datacols, allfiles, by = c("name", "year"))
ipeds <- ipeds %>% filter(!grepl("flags", name)) %>%
  filter(year >= 1990)

# There are a few duplicates in the way that IPEDS lists its files - remove them
ipeds <-ipeds[!duplicated(ipeds[,"path"]),]

# Search for a variable(s), return list of files that contain it
# Search for one or more variables; return a list of the files that contain
# them, split by dataset name. Matching is a case-insensitive regex over the
# global `ipeds` metadata table's `columns` field.
searchVars <- function(vars) {
  # Collapse the search terms into a single alternation pattern
  pattern <- paste(vars, collapse = '|')
  matches <- ipeds %>% filter(grepl(pattern, columns, ignore.case = T))
  # One list element per dataset name
  split(matches, matches$name)
}

# Return the datasets containing the var(s) and selected the necessary columns
getData <- function(datalist, vars, keepallvars) {
allvars <- tolower(c(vars, "unitid", "year"))
for (i in seq_along(datalist)) {
# Construct path to CSV
csvpath <- datalist[[i]]$path
fullpath <- paste(ipedspath, csvpath, sep="")
name <- datalist[[i]]$name

print(paste("Reading in ", fullpath, sep = ""))

# Read CSV - some IPEDS CSVs are malformed, containing extra commas at the end of all rows but the headers
# Need to handle these. Permanent solution - send list of malformed files to NCES. This is a known issue.
row1 <- readLines(fullpath, n = 1)
csvnames <- unlist(strsplit(row1,','))
d <- read.table(fullpath, header = F, stringsAsFactors = F, sep=",", skip = 1, na.strings=c("",".","NA"))
if (length(csvnames) == ncol(d)) {
colnames(d) <- csvnames
} else if (length(csvnames) == ncol(d) - 1) {
colnames(d) <- c(csvnames, "xxx")
print("Malformed CSV - extra column without header. Handled by R function but note for NCES.")
} else if ((length(csvnames) != ncol(d) - 1) & length(csvnames) == ncol(d)) {
print("Malformed CSV - unknown column length mismatch error. Note for NCES")
print(path)
}

#d <- read.csv(fullpath, header=T, stringsAsFactors = F, na.strings=c("",".","NA"))
# Give it a year variable
d$year <- datalist[[i]]$year
# All lowercase colnames
colnames(d) <- tolower(colnames(d))

# OPEID can be sometimes integer sometimes character - coerce to character
if("opeid" %in% colnames(d))
{
d$opeid <- as.character(d$opeid)
}
if("f2a20" %in% colnames(d))
{
d$f2a20 <- as.character(d$f2a20)
}
# unitid sometimes has type issues
d$unitid <- as.character(d$unitid)
# Select just the need vars
if(keepallvars == FALSE) {
selects <- intersect(colnames(d), allvars)
d <- d %>% select(one_of(selects))
} else {
d <- d %>% select(-starts_with("x"))
}
assign(name, d, envir = .GlobalEnv)
}
}

# Bind rows to make one data frame
makeDataset <- function(vars) {
dt <- ipeds %>% filter(grepl(paste(vars, collapse='|'), columns, ignore.case = T))
ipeds_list <- lapply(dt$name, get)
ipedsdata <- bind_rows(ipeds_list)
ipedsdata <- ipedsdata %>% arrange(year, unitid)
# Unit id back to numeric
ipedsdata$unitid <- as.numeric(ipedsdata$unitid)
return(ipedsdata)
}

# If desired (usually the case): Do all the things: search, get datasets
returnData <- function(myvars, keepallvars = FALSE) {
dl <- searchVars(myvars)
getData(dl, myvars, keepallvars)
makeDataset(myvars)
}
# Drop the raw metadata once `ipeds` is built
rm(allfiles, datacols)

# Example - some institutional characteristics
instvars <- c("fips", "stabbr", "instnm", "sector", "pset4flg", "instcat", "ccbasic", "control", "deggrant", "opeflag", "opeind", "opeid", "carnegie", "hloffer")
institutions <- returnData(instvars)

# Make Data interpretable: load the label dictionary built by
# extract_and_compile_labels.py
# NOTE(review): hard-coded user-specific Windows path — parameterize before sharing
labels <- read_excel("C:/Users/allubera/Documents/PythonScripts/ipeds-scraper-2025/data/labels.xlsx")
labels <- na.omit(labels)
institutions <- na.omit(institutions)

#remove label title from everything
labels$label_name = gsub("label_", "",labels$label_name)

# Coerce everything to character so label values can be matched/replaced
# without type mismatches
institutions <- institutions %>%
  mutate(across(everything(), as.character))
labels <- labels %>%
  mutate(value = as.character(value),
         year = as.character(year)) # Ensure year is also character

# Function to replace values with labels safely
# Replace coded dummy values in `data` with their human-readable labels.
# `labels` must have character columns label_name, year, value, label.
# BUG FIX: the original loop variable was named `year`; inside
# dplyr::filter() the bare name `year` resolves to the labels column (data
# masking), so `year == as.character(year)` compared the column to itself
# and the per-year filter never applied — labels from every year matched.
# Renaming the loop variable and using base-R subsetting removes the
# collision entirely.
replace_values_with_labels <- function(data, labels) {
  for (col in colnames(data)) {
    if (col %in% labels$label_name) { # Check if column exists in labels
      for (yr in unique(na.omit(data$year))) { # Remove NA values in year
        # Labels for this column in this specific year
        year_labels <- labels[labels$label_name == col & labels$year == as.character(yr), ]

        if (nrow(year_labels) > 0) {
          for (i in seq_len(nrow(year_labels))) {
            value_match <- year_labels$value[i]
            label_replacement <- year_labels$label[i]

            # which() drops NAs from the logical index, so rows with
            # missing values are never (mis)assigned
            idx <- which(data$year == yr & data[[col]] == value_match)
            if (length(idx) > 0) {
              data[idx, col] <- label_replacement
            }
          }
        }
      }
    }
  }

  return(data)
}

# Apply the label replacement to the example institutions data
updated_data <- replace_values_with_labels(institutions, labels)


87 changes: 87 additions & 0 deletions scripts/downloadStataDoFiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
Download all IPEDS Complete Do Files for a given set of years
Extract and keep final/revised versions
Make a json specifying columns in each data file
Hannah Recht, 04-04-16

The do files contain the labels for the variable values.
Stata files do not have a 'save' function, so we will need to run each script and then save the var labels and definitions to get a crosswalk between dummy values and their actual string definitions.
"""

from urllib.request import urlopen
import json
import zipfile
import os
import csv  # NOTE(review): appears unused in this file's visible code — confirm before removing
import argparse

# Command-line interface: two required positional ints, e.g.
#   python3 scripts/downloadStataDoFiles.py 2000 2005
parser = argparse.ArgumentParser()
parser.add_argument("start", help="start year",
                    type=int)
parser.add_argument("stop", help="stop year",
                    type=int)
args = parser.parse_args()

# Import json of available files, created in scraper.py
# (path is relative, so the script must be run from the repo root)
with open('data/ipedsfiles.json') as fp:
    allfiles = json.load(fp)

# Download all the data in given years
def downloadData(start, stop):
print("*****************************")
print("Downloading do files")
print("*****************************")
for i in range(start,stop):
print("Downloading " + str(i) + " do files")
# Make directory for the raw files - one per year
if not os.path.exists('dofiles/' + str(i) + '/'):
os.makedirs('dofiles/' + str(i) + '/')
# Download all the files in the json
for f in allfiles:
if(f['year']==i):
# URL to download
url = f['dourl']
# dataset file name (XXXX.zip)
urlname = url.split("http://nces.ed.gov/ipeds/datacenter/data/",1)[1]
rd = urlopen(url)
saveurl = "dofiles/" + str(i) +'/' + urlname
# Save the zip files
with open(saveurl, "wb") as p:
p.write(rd.read())
p.close()

# Unzip .zips
zip_ref = zipfile.ZipFile(saveurl, 'r')
zip_ref.extractall("dofiles/" + str(i) +'/')
zip_ref.close()

# Remove zip file
os.remove("dofiles/" + str(i) +'/' + urlname)

# Some datasets have been revised over time, so they'll download XXXX.csv and XXXX_rv.csv
# We only want the revised version
def removeDups(start, stop):
print("*****************************")
print("Removing duplicates")
print("*****************************")
for i in range(start,stop):
print("Removing " + str(i) + " duplicates")
files = os.listdir('dofiles/' + str(i) + '/')
# See how many files are in each year
# print([i,len(files)])
for file in files:
# file name minus '.csv'
name = file[:-4]
# If the file name ends in _rv, keep that one and delete the other (no _rv)
if(name[-3:] =='_rv'):
#print(name)
unrevised = name[:-3]
if(os.path.exists('dofiles/' + str(i) + '/' + unrevised + '.csv')):
os.remove('dofiles/' + str(i) + '/' + unrevised + '.csv')
print('Removed ' + unrevised)
# else:
# print('no match ' + unrevised)

# Run the download, then remove superseded unrevised files, for the
# year range given on the command line (stop year is exclusive).
downloadData(args.start, args.stop)
removeDups(args.start, args.stop)
Loading