rust 简单爬虫

[dependencies]
scraper = "0.12.0"
reqwest = { version = "0.11.10", features = ["blocking", "json"] }
surf = "2.3.2"
tokio = { version = "1.17.0", features = ["full"] }
futures = "0.3.21"

main函数

#[tokio::main]
async fn main() -> surf::Result<()>{
    let stdin = env::args().nth(1).take().unwrap();
    let paths = vec![stdin.to_string(),];
    let result_list = join_all(paths.into_iter().map(|path|{
        fetch_path(path)
    })).await;

    let mut list_string:Vec<String> = vec![];
    for ele in result_list.into_iter(){
        if ele.is_ok(){
            list_string.push(ele.unwrap())
        }else {
            return Err(ele.unwrap_err())
        }
    }
    println!("{}",list_string);
    Ok(())
}

主要逻辑

/// Fetches `path` over HTTP with `surf` and returns the response body as a
/// String.
///
/// Best-effort: on a request or body-read failure it logs to stderr and
/// returns `Ok` with an empty string, matching the original's tolerant
/// behavior (callers that need the error should not rely on this helper).
async fn fetch_path(path: String) -> surf::Result<String> {
    let mut back_string = String::new();
    match surf::get(&path).await {
        Ok(mut response) => {
            match response.body_string().await {
                // The body is already a String — assign it directly instead
                // of the redundant `format!("{}", text)` round trip.
                Ok(text) => back_string = text,
                // Diagnostics belong on stderr, not stdout.
                Err(_) => eprintln!("Read response text Error!"),
            }
        }
        // BUG FIX: the message claimed "reqwest" but this code uses surf.
        Err(_) => eprintln!("surf get Error!"),
    }
    Ok(back_string)
}

在main加入html过滤

async fn main() -> surf::Result<()>{
    let paths = vec![stdin.to_string(),];
    let result_list = join_all(paths.into_iter().map(|path|{
        fetch_path(path)
    })).await;

    let mut list_string:Vec<String> = vec![];
    for ele in result_list.into_iter(){
        if ele.is_ok(){
            list_string.push(ele.unwrap())
        }else {
            return Err(ele.unwrap_err())
        }
    }
    let v = list_string.get(0).take().unwrap();
   //  println!("{}",v);
    let fragment = Html::parse_fragment(v);
    let ul_selector = Selector::parse("script").unwrap();

    for element in fragment.select(&ul_selector) {
        println!("{}",element.inner_html());
    }   
    // println!("请求输出:{:?}",list_string);
    Ok(())
}

完整代码

use futures::future::join_all;
use std::env;
use scraper::{Html, Selector};

/// Fetches `path` over HTTP with `surf` and returns the response body as a
/// String.
///
/// Best-effort: on a request or body-read failure it logs to stderr and
/// returns `Ok` with an empty string, matching the original's tolerant
/// behavior (callers that need the error should not rely on this helper).
async fn fetch_path(path: String) -> surf::Result<String> {
    let mut back_string = String::new();
    match surf::get(&path).await {
        Ok(mut response) => {
            match response.body_string().await {
                // The body is already a String — assign it directly instead
                // of the redundant `format!("{}", text)` round trip.
                Ok(text) => back_string = text,
                // Diagnostics belong on stderr, not stdout.
                Err(_) => eprintln!("Read response text Error!"),
            }
        }
        // BUG FIX: the message claimed "reqwest" but this code uses surf.
        Err(_) => eprintln!("surf get Error!"),
    }
    Ok(back_string)
}


#[tokio::main]
async fn main() -> surf::Result<()>{
    let stdin = env::args().nth(1).take().unwrap();
    let paths = vec![stdin.to_string(),];
    let result_list = join_all(paths.into_iter().map(|path|{
        fetch_path(path)
    })).await;

    let mut list_string:Vec<String> = vec![];
    for ele in result_list.into_iter(){
        if ele.is_ok(){
            list_string.push(ele.unwrap())
        }else {
            return Err(ele.unwrap_err())
        }
    }
    let v = list_string.get(0).take().unwrap();
   //  println!("{}",v);
    let fragment = Html::parse_fragment(v);
    let ul_selector = Selector::parse("script").unwrap();

    for element in fragment.select(&ul_selector) {
        println!("{}",element.inner_html());
    }   
    // println!("请求输出:{:?}",list_string);
    Ok(())
}

运行

cargo run -- https://www.baidu.com/

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值