Skip to content

Commit

Permalink
Better sort logic, more e2e tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ezefranca committed Jul 24, 2024
1 parent dcfcb58 commit 283e051
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 93 deletions.
203 changes: 110 additions & 93 deletions Sources/GoogleScholarSwift/GoogleScholarFetcher.swift
Original file line number Diff line number Diff line change
Expand Up @@ -35,82 +35,99 @@ public class GoogleScholarFetcher {
maxPublications: Int? = nil,
sortBy: SortBy = .cited,
completion: @escaping ([Publication]?, Error?) -> Void) {

var allPublications: [Publication] = []
var startIndex = 0
let pageSize = 100
var totalFetched = 0

func fetchPage() {
guard maxPublications == nil || totalFetched < maxPublications! else {
completion(allPublications, nil)
return
}

guard var urlComponents = URLComponents(string: "https://scholar.google.com/citations") else {
completion(nil, NSError(domain: "Invalid URL", code: 0, userInfo: nil))
return
}
urlComponents.queryItems = [
URLQueryItem(name: "user", value: authorID),
URLQueryItem(name: "oi", value: "ao"),
URLQueryItem(name: "cstart", value: String(startIndex)),
URLQueryItem(name: "pagesize", value: String(pageSize)),
URLQueryItem(name: "sortby", value: sortBy.rawValue)
]
var allPublications: [Publication] = []
var startIndex = 0
let pageSize = 100
var totalFetched = 0

guard let url = urlComponents.url else {
completion(nil, NSError(domain: "Invalid URL Components", code: 0, userInfo: nil))
return
}

var request = URLRequest(url: url)
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
for (cookie, value) in Constants.cookies {
request.addValue("\(cookie)=\(value)", forHTTPHeaderField: "Cookie")
}

let task = session.dataTask(with: request) { data, response, error in
guard let data = data, error == nil else {
completion(nil, error)
func fetchPage() {
guard maxPublications == nil || totalFetched < maxPublications! else {
sortAndComplete()
return
}

do {
guard let html = String(data: data, encoding: .utf8) else {
completion(nil, NSError(domain: "Invalid Data", code: 0, userInfo: nil))
guard var urlComponents = URLComponents(string: "https://scholar.google.com/citations") else {
completion(nil, NSError(domain: "Invalid URL", code: 0, userInfo: nil))
return
}
urlComponents.queryItems = [
URLQueryItem(name: "user", value: authorID),
URLQueryItem(name: "oi", value: "ao"),
URLQueryItem(name: "cstart", value: String(startIndex)),
URLQueryItem(name: "pagesize", value: String(pageSize)),
URLQueryItem(name: "sortby", value: sortBy.rawValue)
]

guard let url = urlComponents.url else {
completion(nil, NSError(domain: "Invalid URL Components", code: 0, userInfo: nil))
return
}

var request = URLRequest(url: url)
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
for (cookie, value) in Constants.cookies {
request.addValue("\(cookie)=\(value)", forHTTPHeaderField: "Cookie")
}

let task = session.dataTask(with: request) { data, response, error in
guard let data = data, error == nil else {
completion(nil, error)
return
}
let doc: Document = try SwiftSoup.parse(html)
let publications = try self.parsePublications(doc)

if let maxPublications = maxPublications {
let remaining = maxPublications - totalFetched
let slicedPublications = Array(publications.prefix(remaining))
allPublications.append(contentsOf: slicedPublications)
totalFetched += slicedPublications.count
} else {
allPublications.append(contentsOf: publications)
totalFetched += publications.count
do {
guard let html = String(data: data, encoding: .utf8) else {
completion(nil, NSError(domain: "Invalid Data", code: 0, userInfo: nil))
return
}
let doc: Document = try SwiftSoup.parse(html)
let publications = try self.parsePublications(doc)

if let maxPublications = maxPublications {
let remaining = maxPublications - totalFetched
let slicedPublications = Array(publications.prefix(remaining))
allPublications.append(contentsOf: slicedPublications)
totalFetched += slicedPublications.count
} else {
allPublications.append(contentsOf: publications)
totalFetched += publications.count
}

if publications.count < pageSize {
sortAndComplete()
} else {
startIndex += pageSize
fetchPage()
}
} catch {
completion(nil, error)
}

if publications.count < pageSize {
completion(allPublications, nil)
} else {
startIndex += pageSize
fetchPage()
}
task.resume()
}

func sortAndComplete() {
var mutablePublications = allPublications

switch sortBy {
case .cited:
mutablePublications.sort { (pub1: Publication, pub2: Publication) -> Bool in
return pub1.citations > pub2.citations
}
case .pubdate:
mutablePublications.sort { (pub1: Publication, pub2: Publication) -> Bool in
return pub1.year > pub2.year
}
} catch {
completion(nil, error)
}

completion(mutablePublications, nil)
}
task.resume()

fetchPage()
}

fetchPage()
}

/// Fetches the detailed information for a specific article.
///
Expand All @@ -133,40 +150,40 @@ public class GoogleScholarFetcher {
public func fetchArticleDetails(
articleDetails: ArticleDetails,
completion: @escaping (Article?, Error?) -> Void) {

guard let url = URL(string: articleDetails.link) else {
completion(nil, NSError(domain: "Invalid URL", code: 0, userInfo: nil))
return
}

var request = URLRequest(url: url)
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
for (cookie, value) in Constants.cookies {
request.addValue("\(cookie)=\(value)", forHTTPHeaderField: "Cookie")
}

let task = session.dataTask(with: request) { data, response, error in
guard let data = data, error == nil else {
completion(nil, error)

guard let url = URL(string: articleDetails.link) else {
completion(nil, NSError(domain: "Invalid URL", code: 0, userInfo: nil))
return
}

do {
guard let html = String(data: data, encoding: .utf8) else {
completion(nil, NSError(domain: "Invalid Data", code: 0, userInfo: nil))
var request = URLRequest(url: url)
for (header, value) in Constants.headers {
request.addValue(value, forHTTPHeaderField: header)
}
for (cookie, value) in Constants.cookies {
request.addValue("\(cookie)=\(value)", forHTTPHeaderField: "Cookie")
}

let task = session.dataTask(with: request) { data, response, error in
guard let data = data, error == nil else {
completion(nil, error)
return
}
let doc: Document = try SwiftSoup.parse(html)
let article = try self.parseArticle(doc)
completion(article, nil)
} catch {
completion(nil, error)

do {
guard let html = String(data: data, encoding: .utf8) else {
completion(nil, NSError(domain: "Invalid Data", code: 0, userInfo: nil))
return
}
let doc: Document = try SwiftSoup.parse(html)
let article = try self.parseArticle(doc)
completion(article, nil)
} catch {
completion(nil, error)
}
}
task.resume()
}
task.resume()
}

/// Parses the publication data from the HTML document.
///
Expand Down Expand Up @@ -210,7 +227,7 @@ public class GoogleScholarFetcher {

return Article(title: title, authors: authors, publicationDate: publicationDate, publication: publication, description: description, totalCitations: totalCitations)
}

private func selectValue(in doc: Document, withIndex index: Int, defaultValue: String = "") throws -> String {
let fieldElements = try doc.select(".gs_scl")
if index < fieldElements.count {
Expand All @@ -221,7 +238,7 @@ public class GoogleScholarFetcher {
}
return defaultValue
}

private func selectTotalCitations(in doc: Document) throws -> String {
if let citationElement = try doc.select(".gsc_oci_value a[href*='cites']").first() {
let citationText = try citationElement.text()
Expand All @@ -231,7 +248,7 @@ public class GoogleScholarFetcher {
}
return ""
}

private func extractNumber(from text: String) -> String? {
let pattern = "\\d+"
if let range = text.range(of: pattern, options: .regularExpression) {
Expand Down
32 changes: 32 additions & 0 deletions Tests/GoogleScholarFetcherTests/GoogleScholarSwiftTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,38 @@ final class GoogleScholarFetcherTests: XCTestCase {
waitForExpectations(timeout: 10, handler: nil)
}

/// Checks that requesting a single publication sorted by publication date
/// yields exactly one result whose year is 2023 or later.
///
/// NOTE(review): uses a default-constructed `GoogleScholarFetcher`, so this
/// presumably performs a real request for the fixed author ID — confirm that
/// the e2e dependency is intended.
func test_FetchPublications_pubdate() {
    let fetcher = GoogleScholarFetcher()
    let authorID = "RefX_60AAAAJ"
    let maxPublications = 1
    let expectation = self.expectation(description: "Fetching publications with limit")

    fetcher.fetchAllPublications(authorID: authorID, maxPublications: maxPublications, sortBy: .pubdate) { publications, error in
        // Fulfill on every exit path so a failed check is reported as a
        // failure rather than as a timeout.
        defer { expectation.fulfill() }

        XCTAssertNil(error, "Error should be nil")
        XCTAssertEqual(publications?.count, maxPublications, "Number of publications should match the limit")

        // Unwrap safely: a nil or empty result must fail this test, not
        // crash the whole test process.
        guard let newest = publications?.first else {
            XCTFail("Expected at least one publication")
            return
        }
        guard let year = Int(newest.year) else {
            XCTFail("Publication year should be numeric, got '\(newest.year)'")
            return
        }
        XCTAssertGreaterThanOrEqual(year, 2023)
    }

    waitForExpectations(timeout: 10, handler: nil)
}

/// Checks that requesting a single publication sorted by citation count
/// yields exactly one result with more than 2400 citations.
///
/// NOTE(review): uses a default-constructed `GoogleScholarFetcher`, so this
/// presumably performs a real request for the fixed author ID — confirm that
/// the e2e dependency is intended.
func test_FetchPublications_citation() {
    let fetcher = GoogleScholarFetcher()
    let authorID = "RefX_60AAAAJ"
    let maxPublications = 1
    let expectation = self.expectation(description: "Fetching publications with limit")

    fetcher.fetchAllPublications(authorID: authorID, maxPublications: maxPublications, sortBy: .cited) { publications, error in
        // Fulfill on every exit path so a failed check is reported as a
        // failure rather than as a timeout.
        defer { expectation.fulfill() }

        XCTAssertNil(error, "Error should be nil")
        XCTAssertEqual(publications?.count, maxPublications, "Number of publications should match the limit")

        // Unwrap safely: a nil or empty result must fail this test, not
        // crash the whole test process.
        guard let mostCited = publications?.first else {
            XCTFail("Expected at least one publication")
            return
        }
        guard let citations = Int(mostCited.citations) else {
            XCTFail("Citation count should be numeric, got '\(mostCited.citations)'")
            return
        }
        XCTAssertGreaterThan(citations, 2400)
    }

    waitForExpectations(timeout: 10, handler: nil)
}

func test_FetchArticleDetails() {
let fetcher = GoogleScholarFetcher()
let authorID = "RefX_60AAAAJ"
Expand Down

0 comments on commit 283e051

Please sign in to comment.