New version #70

Merged
45 commits merged Jan 16, 2025
Changes from 1 commit
0b66a62
Match authors to openalex schema
alepbloyd Sep 26, 2024
8dc71b9
AuthorsIds spec
alepbloyd Sep 26, 2024
9d75972
Add institutions relationships
alepbloyd Sep 26, 2024
d9024ba
temporarily remove nodes and edges, add work relationships
alepbloyd Sep 26, 2024
f4a8e50
Add models to match openalex db
alepbloyd Sep 30, 2024
a72af23
Add db migrations to match openalex db
alepbloyd Sep 30, 2024
76f6e2e
Add model tests
alepbloyd Sep 30, 2024
142ceea
Add concepts_ancestors
alepbloyd Sep 30, 2024
dc5acec
Add load institutions, institutions_geos, institutions_ids rake tasks
alepbloyd Oct 2, 2024
014f429
change migrations to use bigint fields
alepbloyd Oct 2, 2024
5897962
update models for data import
alepbloyd Oct 3, 2024
b547584
start reworking graphql, institutions
alepbloyd Oct 4, 2024
5b08a64
start to rework model tests
alepbloyd Oct 4, 2024
83bf2d9
work on topics/works relationships
alepbloyd Oct 4, 2024
858368f
Add data_import rake tasks
alepbloyd Oct 16, 2024
a94386d
Add migrations
alepbloyd Oct 16, 2024
73dd524
Start of reworking tests
alepbloyd Oct 16, 2024
91fe7d9
update gemfile
alepbloyd Oct 16, 2024
96a37c4
Add additional gql types
alepbloyd Oct 16, 2024
bb571cc
temp remove user context for development
alepbloyd Oct 16, 2024
ad2f2f7
temp db yml and routes updates
alepbloyd Oct 16, 2024
83b4d8c
Add topics import
alepbloyd Oct 17, 2024
36d1c0c
Migrations and models for work-author-institution relationships
alepbloyd Oct 30, 2024
13e2f2c
Reworking migrations
alepbloyd Nov 11, 2024
b5d4614
Reworking loading rake tasks
alepbloyd Nov 11, 2024
23aa897
Updated migrations for full data snapshot
alepbloyd Dec 4, 2024
d07f8ba
Update data_import rake tasks
alepbloyd Dec 4, 2024
669edc9
Update models for full snapshot
alepbloyd Dec 4, 2024
33b98f5
Fix many-to-many self-referential join for works/citations/references
alepbloyd Dec 5, 2024
d021170
update author model with author_openalex_id
alepbloyd Dec 5, 2024
4dbbd5a
Remove _api from schema name
alepbloyd Dec 5, 2024
915a2e9
Start of reworking gql types
alepbloyd Dec 6, 2024
45f6a8b
update worktype
alepbloyd Dec 6, 2024
7eb78ba
new generated types, need to pare back
alepbloyd Dec 6, 2024
c8eb308
additional author and institution indexing
alepbloyd Dec 18, 2024
babe356
updated schema
alepbloyd Dec 18, 2024
a15e559
remove concepts models
alepbloyd Dec 19, 2024
8498d5c
Remove works_concept and works_mesh models
alepbloyd Dec 19, 2024
2459280
add topics
alepbloyd Dec 19, 2024
997f3cc
work_topics migration
alepbloyd Dec 19, 2024
de3d2b3
Fix all text -> bool castings resulting in false
alepbloyd Jan 2, 2025
1482675
rename to bookworm
alepbloyd Jan 2, 2025
4eeb083
Work on frontend simplification and connection with backend
alepbloyd Jan 16, 2025
0d8490f
misc other fixes for loading openalex snapshot version
alepbloyd Jan 16, 2025
6b952f2
Merge branch 'loading-openalex-snapshot' of github.com:gwu-libraries/…
alepbloyd Jan 16, 2025
Add data_import rake tasks
alepbloyd committed Oct 16, 2024
commit 858368ff20505e6611b5a52d97f8eea4ed6ae031
38 changes: 38 additions & 0 deletions rails/lib/tasks/load_authors.rake
@@ -0,0 +1,38 @@
require 'rake'
require 'zlib'
require 'csv'

namespace :data_import do
  desc 'Load Authors from gzipped csv to db'
  task load_authors: :environment do
    file_paths = Dir['/opt/bookworm/csv-files/authors/author_split*']

    file_paths.each do |file_path|
      authors = []
      Zlib::GzipReader.open(file_path) do |gzip|
        csv = CSV.new(gzip)
        csv
          .drop(1)
          .each_with_index do |row, index| # drop(1) handles the header row
            authors << {
              openalex_id: row[0].split('/').last,
              orcid: row[1],
              display_name: row[2],
              display_name_alternatives: row[3],
              works_count: row[4].to_i,
              cited_by_count: row[5].to_i,
              last_known_institution: row[6],
              works_api_url: row[7]
            }

            if authors.count >= 100
              Author.insert_all(authors)

              authors = []
            end
          end
      end
      Author.insert_all(authors)
    end
  end
end
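The loading tasks in this commit all follow the same shape: stream a gzipped CSV, buffer rows in an array, and flush with a bulk insert once a threshold is reached. A minimal standalone sketch of just that batching pattern (the `each_batch` helper and its 100-row default are illustrative, not part of this PR):

```ruby
require 'zlib'
require 'csv'

# Stream a gzipped CSV, skip the header, and yield rows in fixed-size
# batches so the caller can bulk-insert instead of writing row by row.
def each_batch(path, batch_size: 100)
  batch = []
  Zlib::GzipReader.open(path) do |gzip|
    CSV.new(gzip).each_with_index do |row, index|
      next if index.zero? # skip the header row

      batch << row
      if batch.size >= batch_size
        yield batch
        batch = []
      end
    end
  end
  yield batch unless batch.empty? # flush the final partial batch
end
```

Flushing in fixed-size batches keeps memory bounded while avoiding one INSERT per row; the trailing flush mirrors the final `insert_all` after the loop in the task above.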
42 changes: 42 additions & 0 deletions rails/lib/tasks/load_authors_counts_by_years.rake
@@ -0,0 +1,42 @@
require 'rake'
require 'zlib'
require 'csv'

namespace :data_import do
  desc 'Load AuthorsCountsByYear from gzipped csv to db'
  task load_authors_counts_by_year: :environment do
    file_paths =
      Dir[
        '/opt/bookworm/csv-files/authors_counts_by_year/authors_counts_by_year_split*'
      ]

    file_paths.each do |file_path|
      authors_counts_by_year = []
      Zlib::GzipReader.open(file_path) do |gzip|
        csv = CSV.new(gzip)

        csv.each_with_index do |row, index| # NOTE: no drop(1) here, so the header row is not skipped, unlike the other tasks
          author = Author.find_by(openalex_id: row[0].split('/').last)

          unless author.nil?
            authors_counts_by_year << {
              author_id: author.id,
              year: row[1],
              works_count: row[2],
              cited_by_count: row[3],
              oa_works_count: row[4]
            }
          end

          if authors_counts_by_year.count >= 1000
            AuthorsCountsByYear.insert_all(authors_counts_by_year)

            authors_counts_by_year = []
          end
        end
      end

      AuthorsCountsByYear.insert_all(authors_counts_by_year)
    end
  end
end
42 changes: 42 additions & 0 deletions rails/lib/tasks/load_authors_ids.rake
@@ -0,0 +1,42 @@
require 'rake'
require 'zlib'
require 'csv'

namespace :data_import do
  desc 'Load AuthorsIds from gzipped csv to db'
  task load_authors_ids: :environment do
    file_paths = Dir['/opt/bookworm/csv-files/authors_ids/authors_ids_split*']

    file_paths.each do |file_path|
      authors_ids = []
      Zlib::GzipReader.open(file_path) do |gzip|
        csv = CSV.new(gzip)
        csv
          .drop(1)
          .each_with_index do |row, index| # drop(1) handles the header row
            author = Author.find_by(openalex_id: row[1].split('/').last)

            unless author.nil?
              authors_ids << {
                author_id: author.id,
                openalex: row[1].split('/').last,
                orcid: row[2],
                scopus: row[3],
                twitter: row[4],
                wikipedia: row[5],
                mag: row[6]
              }
            end

            if authors_ids.count >= 100
              AuthorsIds.insert_all(authors_ids)

              authors_ids = []
            end
          end
      end

      AuthorsIds.insert_all(authors_ids)
    end
  end
end
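Each CSV row in these tasks costs one `Author.find_by` query. A possible optimization (not part of this PR) is to preload a hash from OpenAlex id to database id once, then resolve rows in memory. The sketch below is plain Ruby, with `records` standing in for something like `Author.pluck(:openalex_id, :id)` (hypothetical usage):

```ruby
# Build a one-shot lookup table from OpenAlex id to local database id.
# `records` is an array of [openalex_id, db_id] pairs, standing in for
# an ActiveRecord pluck in the real task (hypothetical here).
def build_id_map(records)
  records.to_h
end

# Resolve a full OpenAlex URL to a local id; returns nil for unknown
# ids, matching the task's `unless author.nil?` guard.
def resolve(id_map, openalex_url)
  id_map[openalex_url.split('/').last]
end
```

One upfront query plus O(1) hash lookups replaces N `find_by` calls, at the cost of holding the id map in memory for the duration of the import.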
48 changes: 27 additions & 21 deletions rails/lib/tasks/load_institutions.rake
@@ -1,30 +1,36 @@
 require 'rake'
-require "zlib"
+require 'zlib'
 require 'csv'
 
 namespace :data_import do
-  desc "Load Institutions from gzipped csv to db"
-  task :load_institutions => :environment do
+  desc 'Load Institutions from gzipped csv to db'
+  task load_institutions: :environment do
+    institutions = []
 
-    Zlib::GzipReader.open('/opt/bookworm/csv-files/institutions.csv.gz') do |gzip|
+    Zlib::GzipReader.open(
+      '/opt/bookworm/csv-files/institutions.csv.gz'
+    ) do |gzip|
       csv = CSV.new(gzip)
-      csv.drop(1).each_with_index do |row, index| # drop(1) handles the header row
-        Institution.create(
-          openalex_id: row[0].split("/").last,
-          ror: row[1],
-          display_name: row[2],
-          country_code: row[3],
-          institution_type: row[4],
-          homepage_url: row[5],
-          image_url: row[6],
-          image_thumbnail_url: row[7],
-          display_name_acronyms: row[8],
-          display_name_alternatives: row[9],
-          works_count: row[10],
-          cited_by_count: row[11],
-          works_api_url: row[12]
-        )
-      end
+      csv
+        .drop(1)
+        .each_with_index do |row, index| # drop(1) handles the header row
+          institutions << {
+            openalex_id: row[0].split('/').last,
+            ror: row[1],
+            display_name: row[2],
+            country_code: row[3],
+            institution_type: row[4],
+            homepage_url: row[5],
+            image_url: row[6],
+            image_thumbnail_url: row[7],
+            display_name_acronyms: row[8],
+            display_name_alternatives: row[9],
+            works_count: row[10],
+            cited_by_count: row[11],
+            works_api_url: row[12]
+          }
+        end
     end
+    Institution.insert_all(institutions)
   end
 end
45 changes: 45 additions & 0 deletions rails/lib/tasks/load_institutions_associated_institutions.rake
@@ -0,0 +1,45 @@
require 'rake'
require 'zlib'
require 'csv'

namespace :data_import do
  desc 'Load InstitutionsAssociatedInstitutions from gzipped csv to db'
  task load_institutions_associated_institutions: :environment do
    institutions_associated_institutions = []

    Zlib::GzipReader.open(
      '/opt/bookworm/csv-files/institutions_associated_institutions.csv.gz'
    ) do |gzip|
      csv = CSV.new(gzip)

      csv
        .drop(1)
        .each_with_index do |row, index| # drop(1) handles the header row
          institution = Institution.find_by(openalex_id: row[0].split('/').last)

          associated_institution =
            Institution.find_by(openalex_id: row[1].split('/').last)

          unless institution.nil? || associated_institution.nil?
            institutions_associated_institutions << {
              institution_id: institution.id,
              associated_institution_id: associated_institution.id,
              relationship: row[2]
            }
          end

          if institutions_associated_institutions.count >= 100
            InstitutionsAssociatedInstitutions.insert_all(
              institutions_associated_institutions
            )

            institutions_associated_institutions = []
          end
        end
    end

    InstitutionsAssociatedInstitutions.insert_all(
      institutions_associated_institutions
    )
  end
end
38 changes: 38 additions & 0 deletions rails/lib/tasks/load_institutions_counts_by_year.rake
@@ -0,0 +1,38 @@
require 'rake'
require 'zlib'
require 'csv'

namespace :data_import do
  desc 'Load InstitutionsCountsByYear from gzipped csv to db'
  task load_institutions_counts_by_year: :environment do
    institutions_counts_by_years = []

    Zlib::GzipReader.open(
      '/opt/bookworm/csv-files/institutions_counts_by_year.csv.gz'
    ) do |gzip|
      csv = CSV.new(gzip)

      csv
        .drop(1)
        .each_with_index do |row, index| # drop(1) handles the header row
          institution = Institution.find_by(openalex_id: row[0].split('/').last)

          institutions_counts_by_years << {
            institution_id: institution.id,
            year: row[1],
            works_count: row[2],
            cited_by_count: row[3],
            oa_works_count: row[4]
          }

          if institutions_counts_by_years.count >= 100
            InstitutionsCountsByYear.insert_all(institutions_counts_by_years)

            institutions_counts_by_years = []
          end
        end
    end

    InstitutionsCountsByYear.insert_all(institutions_counts_by_years)
  end
end
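Unlike the authors tasks, this loop dereferences `institution.id` with no nil check, so a counts row for an institution that never loaded would raise `NoMethodError`. A small plain-Ruby sketch of the guard the other tasks use (`find_institution_id` is a hypothetical stand-in for the `Institution.find_by` lookup):

```ruby
# Map one CSV row to insert_all attributes, skipping rows whose
# institution is not in the database (returns nil instead of raising).
def counts_row_attrs(row, find_institution_id)
  institution_id = find_institution_id.call(row[0].split('/').last)
  return nil if institution_id.nil? # mirror the authors tasks' nil guard

  {
    institution_id: institution_id,
    year: row[1],
    works_count: row[2],
    cited_by_count: row[3],
    oa_works_count: row[4]
  }
end
```

The caller can then `compact` the mapped rows before the bulk insert, so missing institutions are silently skipped rather than aborting the whole import.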
47 changes: 30 additions & 17 deletions rails/lib/tasks/load_institutions_geo.rake
@@ -1,27 +1,40 @@
 require 'rake'
-require "zlib"
+require 'zlib'
 require 'csv'
 
 namespace :data_import do
-  desc "Load InstitutionsGeo from gzipped csv to db"
-  task :load_institutions_geo => :environment do
+  desc 'Load InstitutionsGeo from gzipped csv to db'
+  task load_institutions_geo: :environment do
+    institutions_geos = []
 
-    Zlib::GzipReader.open('/opt/bookworm/csv-files/institutions_geo.csv.gz') do |gzip|
+    Zlib::GzipReader.open(
+      '/opt/bookworm/csv-files/institutions_geo.csv.gz'
+    ) do |gzip|
       csv = CSV.new(gzip)
-      csv.drop(1).each_with_index do |row, index| # drop(1) handles the header row
-        institution = Institution.find_by(openalex_id: row[0].split("/").last)
+      csv
+        .drop(1)
+        .each_with_index do |row, index| # drop(1) handles the header row
+          institution = Institution.find_by(openalex_id: row[0].split('/').last)
 
-        InstitutionsGeo.create(
-          institution_id: institution.id,
-          city: row[1],
-          geonames_city_id: row[2],
-          region: row[3],
-          country_code: row[4],
-          country: row[5],
-          latitude: row[6],
-          longitude: row[7]
-        )
-      end
+          institutions_geos << {
+            institution_id: institution.id,
+            city: row[1],
+            geonames_city_id: row[2],
+            region: row[3],
+            country_code: row[4],
+            country: row[5],
+            latitude: row[6],
+            longitude: row[7]
+          }
+
+          if institutions_geos.count >= 100
+            InstitutionsGeo.insert_all(institutions_geos)
+
+            institutions_geos = []
+          end
+        end
     end
+
+    InstitutionsGeo.insert_all(institutions_geos)
   end
 end
39 changes: 23 additions & 16 deletions rails/lib/tasks/load_institutions_ids.rake
@@ -1,26 +1,33 @@
 require 'rake'
-require "zlib"
+require 'zlib'
 require 'csv'
 
 namespace :data_import do
-  desc "Load InstitutionsIds from gzipped csv to db"
-  task :load_institutions_ids => :environment do
+  desc 'Load InstitutionsIds from gzipped csv to db'
+  task load_institutions_ids: :environment do
+    institution_ids = []
 
-    Zlib::GzipReader.open('/opt/bookworm/csv-files/institutions_ids.csv.gz') do |gzip|
+    Zlib::GzipReader.open(
+      '/opt/bookworm/csv-files/institutions_ids.csv.gz'
+    ) do |gzip|
       csv = CSV.new(gzip)
-      csv.drop(1).each_with_index do |row, index| # drop(1) handles the header row
-        institution = Institution.find_by(openalex_id: row[0].split("/").last)
+      csv
+        .drop(1)
+        .each_with_index do |row, index| # drop(1) handles the header row
+          institution = Institution.find_by(openalex_id: row[0].split('/').last)
 
-        InstitutionsIds.create(
-          institution_id: institution.id,
-          openalex: row[0].split("/").last,
-          ror: row[2],
-          grid: row[3],
-          wikipedia: row[4],
-          wikidata: row[5],
-          mag: row[6]
-        )
-      end
+          institution_ids << {
+            institution_id: institution.id,
+            openalex: row[0].split('/').last,
+            ror: row[2],
+            grid: row[3],
+            wikipedia: row[4],
+            wikidata: row[5],
+            mag: row[6]
+          }
+        end
     end
 
+    InstitutionsIds.insert_all(institution_ids)
   end
 end