Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additions, updates, fixes, etc. #39

Merged
merged 14 commits into from
Oct 12, 2021
Merged
64 changes: 54 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions schemas/tweet.sql
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ CREATE TABLE tweet_file (
FOREIGN KEY (file_id) REFERENCES file (id)
);
-- Secondary indexes on tweet_file, one each for the tweet_id, file_id and user_id columns.
CREATE INDEX tweet_file_tweet_id_index ON tweet_file (tweet_id);
CREATE INDEX tweet_file_file_id_index ON tweet_file (file_id);
CREATE INDEX tweet_file_user_id_index ON tweet_file (user_id);
74 changes: 74 additions & 0 deletions src/bin/canonical.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use std::path::Path;

/// Result alias used by `main`: `Ok(())` on success, any boxed error on failure.
type Void = Result<(), Box<dyn std::error::Error>>;

#[tokio::main]
async fn main() -> Void {
let args: Vec<String> = std::env::args().collect();
let store_path = args.get(1).unwrap();
let digests_path = args.get(2).unwrap();

use std::io::{self, BufRead};
let file = std::fs::File::open(digests_path)?;
let digests: Vec<String> = io::BufReader::new(file).lines().collect::<Result<_, _>>()?;

use futures::stream::StreamExt;

futures::stream::iter(digests)
.map(|digest| {
let store_path = store_path.clone();
tokio::spawn(async move {
let p1 = format!(
"{}/data/other/{}/{}.gz",
store_path,
digest.chars().next().unwrap(),
digest
);
let p2 = format!(
"{}/data/valid/{}/{}.gz",
store_path,
digest.chars().next().unwrap(),
digest
);
let mut path = Path::new(&p1);

if !path.exists() {
path = Path::new(&p2);
}
let file = std::fs::File::open(path)?;

let mut found = false;
let mut output = "".to_string();

for result in io::BufReader::new(flate2::read::GzDecoder::new(file)).lines() {
let line = result?;
if line.contains("rel=\"canonical\"") || line.contains("rel='canonical'") {
let i = line.find("href=").unwrap();
let link = line
.chars()
.skip(i + 6)
.take_while(|c| *c != '"')
.collect::<String>();
output = format!("{},{}", digest, link);
found = true;
break;
}
}

if !found {
output = format!("{},", digest);
}
let res: std::io::Result<String> = Ok(output);
res
})
})
.buffer_unordered(32)
.for_each(|result| async {
let output = result.unwrap().unwrap();

println!("{}", output);
})
.await;

Ok(())
}
58 changes: 58 additions & 0 deletions src/bin/details.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use cancel_culture::{browser::twitter::parser, cli, wbm, wbm::digest, wbm::valid};
use clap::{crate_authors, crate_version, Clap};
use csv::ReaderBuilder;
use futures::StreamExt;
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Result alias used by `main`: `Ok(())` on success, any boxed error on failure.
type Void = Result<(), Box<dyn std::error::Error>>;

/// Entry point of the `details` tool.
///
/// Parses the command line, initialises logging, and runs the selected
/// subcommand. `parse` opens the given file, parses it with
/// `parser::parse_html_gz` when the path ends in `.gz` and with
/// `parser::parse_html` otherwise, then writes one six-column CSV record to
/// standard output for every tuple produced by `parser::extract_phcs`. A
/// missing sixth value is written as an empty field.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or parsed, or when writing
/// or flushing the CSV output fails.
#[tokio::main]
async fn main() -> Void {
    let opts: Opts = Opts::parse();
    // The result of logging initialisation is deliberately ignored.
    let _ = cli::init_logging(opts.verbose);

    match opts.command {
        SubCommand::Parse { path } => {
            let mut file = File::open(&path)?;
            let html = if path.ends_with(".gz") {
                parser::parse_html_gz(&mut file)
            } else {
                parser::parse_html(&mut file)
            };

            let mut out = csv::WriterBuilder::new().from_writer(std::io::stdout());

            for (a, b, c, d, e, f) in parser::extract_phcs(&html?) {
                out.write_record(&[a, b, c, d, e, f.unwrap_or_default()])?;
            }

            // Flush explicitly: an error during the implicit flush on drop
            // would otherwise go unreported.
            out.flush()?;
        }
    }

    Ok(())
}

// Top-level command-line options for the `details` binary, parsed via the
// `Clap` derive. The `///` comments below sit on clap-derived fields and are
// kept verbatim.
#[derive(Clap)]
#[clap(name = "details", version = crate_version!(), author = crate_authors!())]
struct Opts {
/// Level of verbosity
// Counted from repeated occurrences of the flag; passed to `cli::init_logging`.
#[clap(short, long, parse(from_occurrences))]
verbose: i32,
/// Level of parallelism
// NOTE(review): not read by the `main` visible in this file — confirm whether it is still needed.
#[clap(short, long, default_value = "6")]
parallelism: usize,
// The subcommand to run.
#[clap(subcommand)]
command: SubCommand,
}

// Subcommands of the `details` binary; `Parse` is the only one defined here.
#[derive(Clap)]
enum SubCommand {
// Parses one HTML file (gzip-compressed when the path ends in `.gz`) and
// writes the extracted records as CSV to standard output.
Parse {
/// The file path
#[clap(short, long)]
path: String,
},
}
Loading