基于xhs爬取项目的优化测试_SHELL

源项目地址:https://github.com/cv-cat/Spider_XHS

优化1:便捷使用方式

#!/bin/bash
# Written by Looper 2024.01.11
#
# Interactive menu around the Spider_XHS scripts home.py, one.py and search.py.
# Each option rewrites a value in place with sed (the Cookie store, or a
# parameter located a fixed number of lines above the end of a Python file)
# and then launches the matching script.
# All paths are relative, so start it from the program directory.

# File holding the "key": "value" Cookie pairs rewritten by options 1 and 2.
readonly COOKIE_STORE=static/cookies.txt

# Print a coloured status banner. $1 = ANSI background code, $2 = text.
banner() {
  echo -e "\e[37;4;${1}m===== ${2} =====\e[0m"
}

# Escape $1 so sed matches it literally as the pattern of an s#..#..# command.
sed_quote_pattern() {
  printf '%s' "$1" | sed -e 's/[][\\.*^$#]/\\&/g'
}

# Escape $1 so sed inserts it literally as the replacement of s#..#..#
# ('&' and '\' are special there, '#' is the delimiter).
sed_quote_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&#]/\\&/g'
}

# Print the line that sits $2 lines above the last line of file $1.
tail_line() {
  awk -v back="$2" '{line[NR]=$0} END{print line[NR-back]}' "$1"
}

# Print the current URL suffix stored in Python file $1: the text after the
# last '/' and before the following "'" on the third line from the end.
current_url_suffix() {
  tail_line "$1" 2 | awk -F "/" '{print $NF}' | awk -F "'" '{print $1}'
}

# Replace the literal text $3 with $4, but only on the line that sits $2 lines
# above the end of file $1 (the line the old value was read from), so equal
# text elsewhere in the file is left alone.
# Returns non-zero when the file, the line or the old value is missing.
replace_on_tail_line() {
  local file=$1 back=$2 old=$3 new=$4 line_no

  if [[ ! -f "$file" || -z "$old" ]]; then
    echo "Error:无法从${file}读取当前参数" >&2
    return 1
  fi
  line_no=$(awk -v back="$back" 'END{print NR-back}' "$file")
  if [[ ! "$line_no" =~ ^[0-9]+$ ]] || (( line_no < 1 )); then
    echo "Error:无法从${file}读取当前参数" >&2
    return 1
  fi
  sed -i "${line_no}s#$(sed_quote_pattern "$old")#$(sed_quote_replacement "$new")#g" "$file"
}

# Store value $2 under Cookie key $1 in the Cookie store.
set_cookie_field() {
  local key=$1 value
  value=$(sed_quote_replacement "$2") || return 1
  sed -i 's#"'"$key"'": "[^"]*"#"'"$key"'": "'"$value"'"#' "$COOKIE_STORE"
}

# Print the value of Cookie $1 found in file $2 (one "name=value" per line,
# optionally indented). Everything after the first '=' is kept, so values that
# themselves contain '=' are not cut short.
cookie_value_from_dump() {
  awk -v key="$1" '
    { line = $0; sub(/^[[:space:]]+/, "", line) }
    index(line, key "=") == 1 { print substr(line, length(key) + 2); exit }
  ' "$2"
}

# Print the closing banner of a Cookie configuration run. $1 = failure flag.
report_cookie_result() {
  if (( $1 == 0 )); then
    banner 42 "Cookies参数配置成功!"
  else
    banner 41 "配置有误,请检查"
  fi
}

# Option 1: ask for every Cookie field and write the non-empty answers.
# An empty answer is reported and that field keeps its current value; the
# final banner reflects whether every attempted write succeeded.
configure_cookies_manually() {
  local key value failed=0

  banner 44 "开始配置Cookies参数"
  for key in sec_poison_id gid a1 websectiga webId web_session xsecappid webBuild; do
    read -r -p "请输入${key}: " value
    if [[ -z "$value" ]]; then
      echo "Error:输入不能为空"
    elif ! set_cookie_field "$key" "$value"; then
      failed=1
    fi
  done
  report_cookie_result "$failed"
}

# Option 2: take the Cookie fields from ./cookies.txt, split into lines by
# cookies_check.sh. A key missing from the file is reported and left untouched
# instead of being blanked.
configure_cookies_from_file() {
  local key value failed=0

  banner 44 "开始配置Cookies参数"
  if [[ ! -f cookies.txt ]]; then
    echo "Error:未找到cookies.txt" >&2
    failed=1
  elif ! bash cookies_check.sh > cookies_check.txt; then
    failed=1
  else
    # web_session is left out on purpose: the session value cannot be obtained
    # from this file.
    for key in sec_poison_id gid a1 websectiga webId xsecappid webBuild; do
      value=$(cookie_value_from_dump "$key" cookies_check.txt)
      if [[ -z "$value" ]]; then
        echo "Error:cookies.txt中缺少${key}" >&2
        failed=1
      elif ! set_cookie_field "$key" "$value"; then
        failed=1
      fi
    done
  fi
  report_cookie_result "$failed"
}

# Options 3 and 4: ask for a URL suffix, store it in Python file $1 and run it.
# $2 = prompt text, $3 = label used in the status banners.
# The value currently in the file is read right before replacing it, so the
# option keeps working when it is chosen several times in one session.
download_by_url_suffix() {
  local script=$1 prompt=$2 label=$3 new_suffix old_suffix

  read -r -p "$prompt" new_suffix
  if [[ -z "$new_suffix" ]]; then
    echo "Error:输入不能为空"
    return
  fi

  old_suffix=$(current_url_suffix "$script")
  if ! replace_on_tail_line "$script" 2 "$old_suffix" "$new_suffix"; then
    banner 41 "${label}数据获取失败,请检查"
    return
  fi

  banner 44 "开始获取${label}数据"
  if winpty python "$script"; then
    banner 42 "${label}数据获取成功"
  else
    banner 41 "${label}数据获取失败,请检查"
  fi
}

# Option 5: update keyword, amount and sort order in search.py, run the search
# and build the CSV. An empty keyword or amount is reported and the value
# already in search.py is kept; an empty sort order cancels the run.
download_search_results() {
  local query count order failed=0 py_status csv_status

  read -r -p "请输入搜索关键词: " query
  if [[ -z "$query" ]]; then
    echo "Error:输入不能为空"
  else
    replace_on_tail_line search.py 10 \
      "$(tail_line search.py 10 | awk -F "'" '{print $2}')" "$query" || failed=1
  fi

  read -r -p "请输入下载数量(前多少个): " count
  if [[ -z "$count" ]]; then
    echo "Error:输入不能为空"
  elif [[ ! "$count" =~ ^[0-9]+$ ]]; then
    # The value is pasted into Python source, so anything but digits is refused.
    echo "Error:下载数量必须为数字"
  else
    replace_on_tail_line search.py 8 \
      "$(tail_line search.py 8 | awk -F " " '{print $3}')" "$count" || failed=1
  fi

  read -r -p "请输入搜索排序方式(general: 综合排序 popularity_descending: 热门排序 time_descending: 最新排序): " order
  if [[ -z "$order" ]]; then
    echo "Error:输入不能为空"
    return
  fi
  replace_on_tail_line search.py 6 \
    "$(tail_line search.py 6 | awk -F "'" '{print $2}')" "$order" || failed=1

  if (( failed )); then
    banner 41 "搜索数据获取失败,请检查"
    return
  fi

  banner 44 "开始获取搜索数据"
  winpty python search.py
  py_status=$?
  bash csv_search.sh
  csv_status=$?
  # Both steps have to succeed; the CSV step alone says nothing about the search.
  if (( py_status == 0 && csv_status == 0 )); then
    banner 42 "搜索数据获取成功"
  else
    banner 41 "搜索数据获取失败,请检查"
  fi
}

# Menu loop
while true; do

cat <<EOF
【Spider_XHS_助手,Writen by Looper】
|
1.手动填写登录Cookies参数(需通过网页手动获取)
|
2.自动填写登录Cookies参数(需复制cookies.txt文件至程序目录下)
|
3.多用户下载(下载用户列表所有的笔记)
|
4.多笔记下载(下载笔记列表里所有的笔记)
|
5.下载搜索内容
|
6.退出
EOF

  echo -e  "\e[37;4;44m===== 首次登录请先填写Cookies参数 =====\e[0m" "\n"

  # Leave on end-of-input instead of redrawing the menu forever.
  if ! read -r -p "▶请选择功能对象:" num && [[ -z "$num" ]]; then
    echo
    exit 0
  fi

  case "$num" in
    1)
      configure_cookies_manually
      ;;
    2)
      configure_cookies_from_file
      ;;
    3)
      download_by_url_suffix home.py "请输入用户主页url后缀地址: " "用户"
      ;;
    4)
      download_by_url_suffix one.py "请输入用户笔记url后缀地址: " "笔记"
      ;;
    5)
      download_search_results
      ;;
    6)
      exit 0
      ;;
    *)
      banner 41 "无效的选择。请重新选择[1-6]"
      read -r -p "按任意键继续..." -n 1 -s
      ;;
  esac
done

优化2:集成表格数据

#!/bin/bash
#
# Merge every detail.txt below datas_search/ into datas_search/output.csv
# (one row per file, the text after the first ": " of each line becomes one
# column), then move the whole result set into datas_bak/<timestamp>/.
# Exit status: 0 when the CSV was built and archived, 1 otherwise.

main() {
  local src_dir=datas_search
  local csv_file="${src_dir}/output.csv"
  local csv_header='笔记url,笔记类型,笔记标题,笔记描述,笔记点赞数量,笔记收藏数量,笔记评论数量,笔记分享数量,笔记上传时间,笔记标签'
  local search_bak_dir detail flat rows=0

  # Backup directory
  search_bak_dir="datas_bak/$(date "+%Y%m%d-%H%M%S")/"

  if [[ ! -d "$src_dir" ]]; then
    echo -e "\e[37;4;41m===== CSV表格创建失败 =====\e[0m"
    return 1
  fi

  # Drop blank lines (including whitespace-only ones) from the source files.
  # Only the fresh results are touched, not the archives under datas_bak/.
  find "$src_dir" -name detail.txt -exec sed -i '/^[[:space:]]*$/d' {} +

  # Flatten each file: the lines from the one matching "笔记描述: " up to the
  # one matching "记ip归属地" are joined with spaces into a single line, and
  # that closing line itself is dropped. Paths are read NUL-delimited so
  # names containing spaces survive.
  while IFS= read -r -d '' detail; do
    awk '/笔记描述: /,/记ip归属地/ {ORS = (/记ip归属地/ ? RS : " "); if (/记ip归属地/) {print ""; next}} {print}' "$detail" > "${detail}_output.txt"
  done < <(find "$src_dir" -name detail.txt -print0)

  # Header first, then one row per flattened file. A value is everything after
  # the first ": " of its line; values holding a comma or a double quote are
  # quoted so they stay in one column.
  printf '%s\n' "$csv_header" > "$csv_file" || return 1
  while IFS= read -r -d '' flat; do
    awk '
      {
        sep = index($0, ": ")
        val = (sep > 0) ? substr($0, sep + 2) : ""
        if (val ~ /[",]/) {
          gsub(/"/, "\"\"", val)
          val = "\"" val "\""
        }
        printf "%s%s", (NR == 1 ? "" : ","), val
      }
      END { if (NR) printf "\n" }
    ' "$flat" >> "$csv_file" && rows=$((rows + 1))
  done < <(find "$src_dir" -name "*output.txt" -print0)

  # Nothing collected: do not archive a header-only file.
  if (( rows == 0 )); then
    rm -f -- "$csv_file"
    echo -e "\e[37;4;41m===== CSV表格创建失败 =====\e[0m"
    return 1
  fi

  # Store the results under the backup path.
  if mkdir -p -- "$search_bak_dir" && mv -- "$src_dir"/* "$search_bak_dir"; then
    return 0
  fi
  echo -e "\e[37;4;41m===== CSV表格创建失败 =====\e[0m"
  return 1
}

main "$@"

 其他:

cookies_check.sh

#!/bin/bash
# Written by Looper 2024.01.11
#
# Print each ';'-separated piece of ./cookies.txt on a line of its own.

# Load the cookie file.
file_content=$(<cookies.txt)

# Cut the first line of the content at every ';' and keep the pieces in an array.
IFS=';' read -r -a parts <<< "$file_content"

# Emit the pieces one per line, in order; whitespace around a piece is kept.
idx=0
while (( idx < ${#parts[@]} )); do
  echo "${parts[idx]}"
  idx=$(( idx + 1 ))
done

文件结构

Readme

【获取cookies】
1、复制以下JavaScript代码,粘贴至浏览器地址栏
javascript: (function() { const a = document.createElement('a');  a.href = 'data:text/plain,' + document.cookie;  a.download = 'cookies.txt';  a.target = '_blank';  a.style.display = 'none';  document.body.appendChild(a);  a.click();  setTimeout(function() {    document.body.removeChild(a);  }, 100);})();

2、CTRL+A全选地址栏文本,按左方向箭头将光标移至最前

3、在最前方手动输入以下内容(注意英文输入法,且: 后方有一位空格): 
Javascript: 

4、按下回车键,下载cookies.txt 文件

5、将cookies.txt 文件复制到程序目录下

6、在程序目录下空白处右击鼠标,选择打开:Open Git Bash here

7、在终端框内输入执行命令:bash get_xhs_info.sh

8、按照终端提示进行配置即可

  • 9
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值