Skip to content

Commit

Permalink
fix(scraper): additionally grab common words
Browse files Browse the repository at this point in the history
  • Loading branch information
sigaloid committed Nov 15, 2024
1 parent 62717ef commit 6c64ebd
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 12 deletions.
46 changes: 46 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ rss = "2.0.7"
arc-swap = "1.7.1"
serde_json_path = "0.6.7"
async-recursion = "1.1.1"
common-words-all = { version = "0.0.2", default-features = false, features = ["english", "one"] }


[dev-dependencies]
Expand Down
79 changes: 67 additions & 12 deletions src/scraper/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::{fmt::Display, io::Write};
use std::{collections::HashMap, fmt::Display, io::Write};

use clap::{Parser, ValueEnum};
use common_words_all::{get_top, Language, NgramSize};
use redlib::utils::Post;

#[derive(Parser)]
Expand All @@ -10,9 +11,6 @@ struct Cli {
#[arg(short = 's', long = "sub")]
sub: String,

#[arg(short = 'c', long = "count")]
count: usize,

#[arg(long = "sort")]
sort: SortOrder,

Expand Down Expand Up @@ -50,28 +48,85 @@ enum Format {

#[tokio::main]
async fn main() {
pretty_env_logger::init();
let cli = Cli::parse();
let (sub, final_count, sort, format, output) = (cli.sub, cli.count, cli.sort, cli.format, cli.output);
let (sub, sort, format, output) = (cli.sub, cli.sort, cli.format, cli.output);
let initial = format!("/r/{sub}/{sort}.json?&raw_json=1");
let (mut posts, mut after) = Post::fetch(&initial, false).await.unwrap();
while posts.len() < final_count {
let (posts, mut after) = Post::fetch(&initial, false).await.unwrap();
let mut hashmap = HashMap::new();
hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post)));
loop {
print!("\r");
let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1");
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
posts.extend(new_posts);
let old_len = hashmap.len();
// convert to hashmap and extend hashmap
let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<String, Post>>();
let len = new_posts.len();
hashmap.extend(new_posts);
if hashmap.len() - old_len < 3 {
break;
}

let x = hashmap.len() - old_len;
after = new_after;
// Print number of posts fetched
print!("Fetched {} posts", posts.len());
print!("Fetched {len} posts (+{x})",);
std::io::stdout().flush().unwrap();
}
println!("\n\n");
// additionally search if final count not reached

for word in get_top(Language::English, 10_000, NgramSize::One) {
let mut retrieved_posts_from_search = 0;
let initial = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}");
println!("Grabbing posts with word {word}.");
let (posts, mut after) = Post::fetch(&initial, false).await.unwrap();
hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post)));
'search: loop {
let path = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}&after={after}");
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
if new_posts.is_empty() || new_after.is_empty() {
println!("No more posts for word {word}");
break 'search;
}
retrieved_posts_from_search += new_posts.len();
let old_len = hashmap.len();
let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<String, Post>>();
let len = new_posts.len();
hashmap.extend(new_posts);
let delta = hashmap.len() - old_len;
after = new_after;
// Print number of posts fetched
println!("Fetched {len} posts (+{delta})",);

if retrieved_posts_from_search > 1000 {
println!("Reached 1000 posts from search");
break 'search;
}
}
// Need to save incrementally. atomic save + move
let tmp_file = output.clone().unwrap_or_else(|| format!("{sub}.json.tmp"));
let perm_file = output.clone().unwrap_or_else(|| format!("{sub}.json"));
write_posts(&hashmap.values().collect(), tmp_file.clone());
// move file
std::fs::rename(tmp_file, perm_file).unwrap();
}

println!("\n\n");

posts.truncate(final_count);
println!("Size of hashmap: {}", hashmap.len());

let posts: Vec<&Post> = hashmap.values().collect();
match format {
Format::Json => {
let filename: String = output.unwrap_or_else(|| format!("{sub}.json"));
let json = serde_json::to_string(&posts).unwrap();
std::fs::write(filename, json).unwrap();
write_posts(&posts, filename);
}
}
}

fn write_posts(posts: &Vec<&Post>, filename: String) {
let json = serde_json::to_string(&posts).unwrap();
std::fs::write(filename, json).unwrap();
}

0 comments on commit 6c64ebd

Please sign in to comment.