Begin integrating wayback-rs
travisbrown committed Dec 11, 2021
1 parent 8f6619b commit 9206ae3
Showing 18 changed files with 395 additions and 292 deletions.
295 changes: 294 additions & 1 deletion Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -52,6 +52,7 @@ tokio-test = "0.4"
 toml = "0.5"
 tryhard = "0.4"
 url = "2.2"
+wayback-rs = "0.1.0"
 
 [features]
 bundled-sqlite3 = ["libsqlite3-sys/bundled"]
14 changes: 10 additions & 4 deletions src/bin/report.rs
@@ -5,6 +5,7 @@ use csv::ReaderBuilder;
 use itertools::Itertools;
 use std::collections::{HashMap, HashSet};
 use std::fs::File;
+use wayback_rs::Item;
 
 type Void = Result<(), Box<dyn std::error::Error>>;
 
@@ -44,11 +45,16 @@ async fn main() -> Void {
             if row.len() > 2 && hashes.contains(&row[2]) {
                 Some((
                     row[2].to_string(),
-                    wbm::Item::parse_optional(
+                    Item::parse_optional_record(
                         row.get(0),
                         row.get(1),
                         row.get(2),
                         row.get(3),
+                        if row.len() == 5 {
+                            Some("0")
+                        } else {
+                            row.get(4)
+                        },
                         if row.len() == 5 {
                             row.get(4)
                         } else {
@@ -61,7 +67,7 @@
                 None
             }
         })
-        .collect::<HashMap<String, wbm::Item>>();
+        .collect::<HashMap<String, Item>>();
 
     log::info!("{} items found", by_digest.len());
 
@@ -95,7 +101,7 @@
         }
     }
 
-    items.sort_by_key(|(_, item)| item.archived);
+    items.sort_by_key(|(_, item)| item.archived_at);
     items.reverse();
 
     println!(
@@ -109,7 +115,7 @@
             println!(
                 "* Archived as @{} on [{}]({})",
                 tweet.user_screen_name,
-                item.archived.format("%e %B %Y"),
+                item.archived_at.format("%e %B %Y"),
                 item.wayback_url(false)
             );
         }
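
For context on the call above: the old `wbm::Item::parse_optional` took five fields (URL, timestamp, digest, MIME type, status), while `wayback_rs::Item::parse_optional_record` takes six, with what appears to be a length column in fifth position that five-field CDX rows lack, hence the `Some("0")` default. A minimal sketch of the dispatch, assuming only the signature visible in this diff (six `Option<&str>` arguments returning a `Result`); the helper name is hypothetical:

```rust
use wayback_rs::Item;

// Hypothetical helper mirroring the row-length dispatch above: five-field
// rows omit the length column, so "0" is substituted in fifth position.
fn item_from_record(row: &csv::StringRecord) -> Option<Item> {
    Item::parse_optional_record(
        row.get(0),                                           // URL
        row.get(1),                                           // timestamp
        row.get(2),                                           // digest
        row.get(3),                                           // MIME type
        if row.len() == 5 { Some("0") } else { row.get(4) },  // length (assumed)
        if row.len() == 5 { row.get(4) } else { row.get(5) }, // status
    )
    .ok()
}
```
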
10 changes: 5 additions & 5 deletions src/bin/twcc.rs
@@ -361,8 +361,8 @@ async fn main() -> Result<()> {
                     .into_iter()
                     .filter(|item| item.status.is_none() || item.status == Some(200))
                     .collect::<Vec<_>>();
-                let last = valid.iter().map(|item| item.archived).max();
-                let first = valid.into_iter().min_by_key(|item| item.archived);
+                let last = valid.iter().map(|item| item.archived_at).max();
+                let first = valid.into_iter().min_by_key(|item| item.archived_at);
 
                 first.zip(last).map(|(f, l)| (id, l, f))
             })
@@ -374,12 +374,12 @@
 
             let selected = candidates.into_iter().take(limit.unwrap_or(usize::MAX));
 
-            let mut by_id: HashMap<u64, wayback::Item> = HashMap::new();
+            let mut by_id: HashMap<u64, wayback_rs::Item> = HashMap::new();
 
             for (id, _, current) in selected {
                 match by_id.get(&id) {
                     Some(latest) => {
-                        if latest.archived < current.archived {
+                        if latest.archived_at < current.archived_at {
                             by_id.insert(id, current);
                         }
                     }
@@ -400,7 +400,7 @@
 
             use cancel_culture::browser::twitter::parser::BrowserTweet;
 
-            let mut report_items = HashMap::<u64, (BrowserTweet, wayback::Item)>::new();
+            let mut report_items = HashMap::<u64, (BrowserTweet, wayback_rs::Item)>::new();
 
             if let Some(s) = store.as_ref() {
                 let mut items = Vec::with_capacity(by_id.len());
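
The loop above deduplicates snapshots by tweet ID, keeping the one with the latest `archived_at`. Pulled out as a standalone function, the same reduction looks like this (a sketch; it assumes nothing about `Item` beyond the `archived_at` field this diff already uses):

```rust
use std::collections::HashMap;
use wayback_rs::Item;

// Keep only the most recently archived snapshot for each tweet ID.
fn latest_by_id(items: Vec<(u64, Item)>) -> HashMap<u64, Item> {
    let mut by_id: HashMap<u64, Item> = HashMap::new();
    for (id, current) in items {
        // Insert if the ID is unseen or this snapshot is newer.
        let newer = by_id
            .get(&id)
            .map_or(true, |latest| latest.archived_at < current.archived_at);
        if newer {
            by_id.insert(id, current);
        }
    }
    by_id
}
```
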
5 changes: 3 additions & 2 deletions src/bin/wbstore.rs
@@ -1,13 +1,14 @@
 use cancel_culture::{
     cli,
-    wayback::{cdx::Client, Item, Result, Store},
+    wayback::{cdx::Client, Result, Store},
 };
 use clap::Parser;
 use flate2::{write::GzEncoder, Compression, GzBuilder};
 use futures::StreamExt;
 use std::collections::HashSet;
 use std::fs::File;
 use std::io::BufRead;
+use wayback_rs::Item;
 
 #[tokio::main]
 async fn main() -> Result<()> {
@@ -433,7 +434,7 @@ fn save_contents_gz(item: &Item, base: &str, content: &[u8]) -> Result<()> {
     log::info!("Saving {} to {:?} ({})", item.digest, base, item.url);
     let file = File::create(std::path::Path::new(base).join(format!("{}.gz", item.digest)))?;
     let mut gz = GzBuilder::new()
-        .filename(item.infer_filename())
+        .filename(item.make_filename())
         .write(file, Compression::default());
     gz.write_all(content)?;
     gz.finish()?;
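
The `GzBuilder` pattern in `save_contents_gz` (repeated in `wbvalidate.rs` and `cdx.rs` below) records a human-readable filename in the gzip header while the file on disk is named by digest. A self-contained sketch of that pattern, assuming `make_filename` returns a digest-based name the way the old `infer_filename` did:

```rust
use flate2::{Compression, GzBuilder};
use std::fs::File;
use std::io::Write;
use wayback_rs::Item;

// Write `content` to <base>/<digest>.gz, storing `make_filename()`'s
// result (digest plus inferred extension) in the gzip header.
fn write_item_gz(item: &Item, base: &str, content: &[u8]) -> std::io::Result<()> {
    let path = std::path::Path::new(base).join(format!("{}.gz", item.digest));
    let file = File::create(path)?;
    let mut gz = GzBuilder::new()
        .filename(item.make_filename())
        .write(file, Compression::default());
    gz.write_all(content)?;
    gz.finish()?;
    Ok(())
}
```
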
2 changes: 1 addition & 1 deletion src/bin/wbvalidate.rs
@@ -58,7 +58,7 @@ async fn main() -> Void {
                 log::info!("Saving {} to {:?}", actual, path);
                 let file = File::create(path.join(format!("{}.gz", actual)))?;
                 let mut gz = GzBuilder::new()
-                    .filename(item.infer_filename())
+                    .filename(item.make_filename())
                     .write(file, Compression::default());
                 gz.write_all(&result)?;
                 gz.finish()?;
2 changes: 1 addition & 1 deletion src/twitter/store/wayback.rs
@@ -1,11 +1,11 @@
 use crate::browser::twitter::parser::BrowserTweet;
 use crate::util::sqlite::{SQLiteDateTime, SQLiteId};
-use crate::wayback::Item;
 use futures_locks::RwLock;
 use rusqlite::{params, Connection, DropBehavior, OptionalExtension};
 use std::collections::HashSet;
 use std::path::Path;
 use thiserror::Error;
+use wayback_rs::Item;
 
 const DIGEST_SELECT: &str = "SELECT DISTINCT value FROM digest";
 const DIGEST_INSERT: &str = "INSERT OR IGNORE INTO digest (tweet_id, value, url) VALUES (?, ?, ?)";
38 changes: 34 additions & 4 deletions src/wayback/cdx.rs
@@ -1,4 +1,4 @@
-use super::{Item, Result, Store};
+use super::{Result, Store};
 use bytes::Bytes;
 use flate2::{Compression, GzBuilder};
 use futures::{Future, FutureExt, StreamExt, TryStreamExt};
@@ -9,6 +9,7 @@ use std::io::{BufReader, Read, Write};
 use std::ops::Deref;
 use std::path::Path;
 use std::time::Duration;
+use wayback_rs::Item;
 
 pub struct Client {
     underlying: RClient,
@@ -38,10 +39,39 @@ impl Client {
         }
     }
 
+    fn from_row(row: &[String]) -> Result<Item> {
+        if row.len() == 5 {
+            Item::parse_optional_record(
+                Some(&row[0]),
+                Some(&row[1]),
+                Some(&row[2]),
+                Some(&row[3]),
+                Some("0"),
+                Some(&row[4]),
+            )
+            .map_err(super::Error::from)
+        } else if row.len() == 6 {
+            Item::parse_optional_record(
+                Some(&row[0]),
+                Some(&row[1]),
+                Some(&row[2]),
+                Some(&row[3]),
+                Some(&row[4]),
+                Some(&row[5]),
+            )
+            .map_err(super::Error::from)
+        } else {
+            Err(super::Error::ItemParsingError(format!(
+                "Invalid item fields: {:?}",
+                row
+            )))
+        }
+    }
+
     fn decode_rows(rows: Vec<Vec<String>>) -> Result<Vec<Item>> {
         rows.into_iter()
             .skip(1)
-            .map(|row| Item::from_row(&row))
+            .map(|row| Self::from_row(&row))
            .collect()
     }
 
@@ -102,15 +132,15 @@
             log::info!("Saving {} to {:?} ({})", actual, good_dir, item.url);
             let file = File::create(good_dir.join(format!("{}.gz", actual)))?;
             let mut gz = GzBuilder::new()
-                .filename(item.infer_filename())
+                .filename(item.make_filename())
                 .write(file, Compression::default());
             gz.write_all(&result)?;
             gz.finish()?;
         } else {
             log::info!("Saving {} to {:?} ({})", item.digest, bad_dir, item.url);
             let file = File::create(bad_dir.join(format!("{}.gz", item.digest)))?;
             let mut gz = GzBuilder::new()
-                .filename(item.infer_filename())
+                .filename(item.make_filename())
                 .write(file, Compression::default());
             gz.write_all(&result)?;
             gz.finish()?;
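
A note on `decode_rows`: when the CDX API is queried with `output=json`, the body is a JSON array of string arrays whose first element names the fields, which is why `decode_rows` skips the first row before handing each remaining row to `from_row`. A sketch of that shape, with illustrative header names and values (the field order is the one `from_row` expects, not necessarily the CDX default):

```rust
// Illustrative only: the shape of a CDX `output=json` response as
// consumed by `decode_rows`. The header row is dropped by `.skip(1)`.
fn sample_rows() -> Vec<Vec<String>> {
    vec![
        // Header row naming the fields.
        vec!["original", "timestamp", "digest", "mimetype", "length", "statuscode"],
        // A six-field data row; five-field rows omit the length column.
        vec![
            "https://twitter.com/example/status/1",
            "20211211000000",
            "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
            "text/html",
            "12345",
            "200",
        ],
    ]
    .into_iter()
    .map(|row| row.into_iter().map(String::from).collect())
    .collect()
}
```
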
22 changes: 13 additions & 9 deletions src/wayback/error.rs
@@ -2,19 +2,21 @@ use crate::twitter::store::wayback;
 use fantoccini::error::CmdError;
 use std::fmt::{Debug, Display, Formatter};
 use std::path::PathBuf;
+use thiserror::Error;
 use tokio::task::JoinError;
 
-#[derive(Debug)]
+#[derive(Error, Debug)]
 pub enum Error {
-    ClientError(reqwest::Error),
+    ClientError(#[from] reqwest::Error),
+    ItemError(#[from] wayback_rs::item::Error),
     ItemParsingError(String),
-    ItemDecodingError(serde_json::Error),
-    FileIOError(std::io::Error),
-    StoreContentsDecodingError(csv::Error),
-    StoreContentsEncodingError(Box<csv::IntoInnerError<csv::Writer<Vec<u8>>>>),
-    BrowserError(CmdError),
-    TaskError(JoinError),
-    TweetStoreError(wayback::TweetStoreError),
+    ItemDecodingError(#[from] serde_json::Error),
+    FileIOError(#[from] std::io::Error),
+    StoreContentsDecodingError(#[from] csv::Error),
+    StoreContentsEncodingError(#[from] csv::IntoInnerError<csv::Writer<Vec<u8>>>),
+    BrowserError(#[from] CmdError),
+    TaskError(#[from] JoinError),
+    TweetStoreError(#[from] wayback::TweetStoreError),
     DataPathError(PathBuf),
 }
 
@@ -24,6 +26,7 @@ impl Display for Error {
     }
 }
 
+/*
 impl std::error::Error for Error {}
 
 impl From<reqwest::Error> for Error {
@@ -73,3 +76,4 @@ impl From<wayback::TweetStoreError> for Error {
         Error::TweetStoreError(e)
     }
 }
+*/
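
For reference, each `#[from]` attribute added above makes `thiserror` derive the corresponding `From` impl, which is what lets the hand-written conversions be commented out while `?` keeps working at every call site; this commit keeps its manual `Display` impl rather than switching to `#[error(...)]` messages. A minimal standalone illustration with a made-up two-variant error:

```rust
use thiserror::Error;

// `#[from]` derives `From<std::io::Error>` and `From<serde_json::Error>`,
// so `?` converts both error types into `MyError` automatically.
#[derive(Error, Debug)]
enum MyError {
    #[error("I/O error")]
    Io(#[from] std::io::Error),
    #[error("JSON decoding error")]
    Decode(#[from] serde_json::Error),
}

fn read_json(path: &str) -> Result<serde_json::Value, MyError> {
    let text = std::fs::read_to_string(path)?; // io::Error -> MyError::Io
    Ok(serde_json::from_str(&text)?)           // serde_json::Error -> MyError::Decode
}
```
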
120 changes: 0 additions & 120 deletions src/wayback/mod.rs
@@ -6,124 +6,4 @@ pub mod web;
 pub use error::Error;
 pub use store::Store;
 
-use chrono::NaiveDateTime;
-
 pub type Result<T> = std::result::Result<T, Error>;
-
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct Item {
-    pub url: String,
-    pub archived: NaiveDateTime,
-    pub digest: String,
-    pub mimetype: String,
-    pub status: Option<u16>,
-}
-
-impl Item {
-    const DATE_FMT: &'static str = "%Y%m%d%H%M%S";
-
-    pub fn new(
-        url: String,
-        archived: NaiveDateTime,
-        digest: String,
-        mimetype: String,
-        status: Option<u16>,
-    ) -> Item {
-        Item {
-            url,
-            archived,
-            digest,
-            mimetype,
-            status,
-        }
-    }
-
-    pub fn wayback_url(&self, original: bool) -> String {
-        format!(
-            "http://web.archive.org/web/{}{}/{}",
-            self.timestamp(),
-            if original { "id_" } else { "if_" },
-            self.url
-        )
-    }
-
-    pub fn timestamp(&self) -> String {
-        self.archived.format(Item::DATE_FMT).to_string()
-    }
-
-    pub fn status_code(&self) -> String {
-        self.status.map_or("-".to_string(), |v| v.to_string())
-    }
-
-    pub fn infer_extension(&self) -> Option<String> {
-        match self.mimetype.as_str() {
-            "application/json" => Some("json".to_string()),
-            "text/html" => Some("html".to_string()),
-            _ => None,
-        }
-    }
-
-    pub fn infer_filename(&self) -> String {
-        self.infer_extension().map_or_else(
-            || self.digest.clone(),
-            |ext| format!("{}.{}", self.digest, ext),
-        )
-    }
-
-    fn parse(
-        url: &str,
-        timestamp: &str,
-        digest: &str,
-        mimetype: &str,
-        status: &str,
-    ) -> Result<Item> {
-        let archived = NaiveDateTime::parse_from_str(timestamp, Item::DATE_FMT)
-            .map_err(|_| Error::ItemParsingError(format!("Unexpected timestamp: {}", timestamp)))?;
-
-        let status_parsed = if status == "-" {
-            Ok(None)
-        } else {
-            status
-                .parse::<u16>()
-                .map(Some)
-                .map_err(|_| Error::ItemParsingError(format!("Unexpected status: {}", status)))
-        }?;
-
-        Ok(Item::new(
-            url.to_string(),
-            archived,
-            digest.to_string(),
-            mimetype.to_string(),
-            status_parsed,
-        ))
-    }
-
-    fn parse_optional(
-        url: Option<&str>,
-        timestamp: Option<&str>,
-        digest: Option<&str>,
-        mimetype: Option<&str>,
-        status: Option<&str>,
-    ) -> Result<Item> {
-        Self::parse(
-            url.ok_or_else(|| Error::ItemParsingError("Missing URL".to_string()))?,
-            timestamp.ok_or_else(|| Error::ItemParsingError("Missing timestamp".to_string()))?,
-            digest.ok_or_else(|| Error::ItemParsingError("Missing digest".to_string()))?,
-            mimetype.ok_or_else(|| Error::ItemParsingError("Missing mimetype".to_string()))?,
-            status.ok_or_else(|| Error::ItemParsingError("Missing status".to_string()))?,
-        )
-    }
-
-    fn from_row(row: &[String]) -> Result<Item> {
-        if row.len() == 5 {
-            Item::parse(&row[0], &row[1], &row[2], &row[3], &row[4])
-        } else if row.len() == 6 {
-            Item::parse(&row[0], &row[1], &row[2], &row[3], &row[5])
-        } else {
-            Err(Error::ItemParsingError(format!(
-                "Invalid item fields: {:?}",
-                row
-            )))
-        }
-    }
-}
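
Everything removed here now comes from the `wayback-rs` crate, with renames visible at the call sites in this commit: `archived` becomes `archived_at`, `infer_filename` becomes `make_filename`, and `parse_optional` becomes `parse_optional_record` with the extra sixth field. A sketch of the replacement usage, assuming only what those call sites imply about the crate's `Item`:

```rust
use wayback_rs::Item;

// Uses only fields and methods exercised elsewhere in this commit;
// not checked against wayback-rs's own documentation.
fn describe(item: &Item) {
    println!(
        "{} archived {} -> {}",
        item.url,
        item.archived_at.format("%e %B %Y"), // was `archived`
        item.wayback_url(false),
    );
    let gz_name = format!("{}.gz", item.make_filename()); // was `infer_filename`
    println!("would save as {}", gz_name);
}
```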