源项目地址:https://github.com/cv-cat/Spider_XHS
优化1:便捷使用方式
#!/bin/bash
# Written by Looper 2024.01.11
#
# Interactive menu for Spider_XHS. Run it from the program directory: every
# path used below (static/cookies.txt, cookies.txt, home.py, one.py,
# search.py, cookies_check.sh, csv_search.sh) is relative to the current
# directory.

# File whose "name": "value" pairs are rewritten by menu options 1 and 2.
readonly COOKIE_STORE=static/cookies.txt
# Cookie names, in the order option 1 asks for them.
readonly -a COOKIE_KEYS=(
  sec_poison_id gid a1 websectiga webId web_session xsecappid webBuild
)

#######################################
# Print a status banner (white underlined text on a coloured background).
# Arguments: $1 - background colour code (44, 42 or 41 in this script)
#            $2 - banner text
#######################################
banner() {
  echo -e "\e[37;4;${1}m===== ${2} =====\e[0m"
}

#######################################
# Escape text so it is taken literally in the replacement half of a sed
# "s#...#...#" command ('\', '&' and the '#' delimiter are special there).
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[&#\\]/\\&/g'
}

#######################################
# Store a new value for one cookie in COOKIE_STORE.
# Arguments: $1 - cookie name, $2 - new value
# Returns:   non-zero if the store is unreadable or has no such field
#######################################
set_cookie_field() {
  local key=$1 value
  value=$(sed_escape_replacement "$2")
  # sed exits 0 even when nothing matched, so confirm the field exists first.
  grep -q "\"${key}\": \"[^\"]*\"" "$COOKIE_STORE" || return 1
  sed -i 's#"'"$key"'": "[^"]*"#"'"$key"'": "'"$value"'"#' "$COOKIE_STORE"
}

#######################################
# Look up one cookie in a "name=value"-per-line dump.
# The name must match exactly (not as a substring) and the value is
# everything after the first '=', so values containing '=' stay intact.
# Arguments: $1 - cookie name, $2 - dump file
# Outputs:   the value of the first matching entry (nothing if absent)
#######################################
cookie_from_dump() {
  awk -v key="$1" '
    {
      sub(/^[ \t\r]+/, "")
      sub(/[ \t\r]+$/, "")
      eq = index($0, "=")
      if (eq > 1 && substr($0, 1, eq - 1) == key) {
        print substr($0, eq + 1)
        exit
      }
    }
  ' "$2"
}

#######################################
# Overwrite one field on one line of a Python script. The line is located
# by its distance from the last line at call time, so repeated use within a
# session keeps working, and only that field is touched instead of every
# occurrence of the old text in the file.
# Arguments:
#   $1 - file to edit
#   $2 - offset of the target line from the last line (2 = third from last)
#   $3 - field kind:
#          quoted   text between the first pair of single quotes
#          word3    third whitespace-separated word
#          url_tail text after the last '/' up to the next single quote
#   $4 - new value
# Returns: non-zero if the file is unreadable, too short, or sed fails
#######################################
replace_field() {
  local file=$1 offset=$2 kind=$3 value lineno script
  value=$(sed_escape_replacement "$4")
  lineno=$(awk -v off="$offset" 'END { print NR - off }' "$file") || return 1
  (( lineno >= 1 )) || return 1
  case "$kind" in
    quoted)   script="s#'[^']*'#'${value}'#" ;;
    word3)    script="s#[^[:space:]]\{1,\}#${value}#3" ;;
    url_tail) script="s#\(.*/\)[^']*#\1${value}#" ;;
    *)        return 2 ;;
  esac
  sed -i "${lineno}${script}" "$file"
}

#######################################
# Menu option 1: ask for every cookie and store the non-empty answers.
# Success is reported only if every field was supplied and written.
#######################################
configure_cookies_manually() {
  local key value ok=1
  banner 44 "开始配置Cookies参数"
  for key in "${COOKIE_KEYS[@]}"; do
    value=
    read -r -p "请输入${key}: " value
    if [[ -z "$value" ]]; then
      echo "Error:输入不能为空"
      ok=0
    elif ! set_cookie_field "$key" "$value"; then
      ok=0
    fi
  done
  if (( ok )); then
    banner 42 "Cookies参数配置成功!"
  else
    banner 41 "配置有误,请检查"
  fi
}

#######################################
# Menu option 2: take the cookies from ./cookies.txt. cookies_check.sh
# splits that file into cookies_check.txt, from which each value is read.
# A missing cookies.txt, a failing helper or a missing cookie is reported
# as a failure; a missing cookie never blanks the stored value.
#######################################
configure_cookies_from_export() {
  local key value ok=1
  banner 44 "开始配置Cookies参数"
  if [[ ! -f cookies.txt ]]; then
    echo "Error:未找到cookies.txt文件" >&2
    ok=0
  elif ! bash cookies_check.sh > cookies_check.txt; then
    ok=0
  else
    for key in "${COOKIE_KEYS[@]}"; do
      # The original author noted that the session value cannot be obtained
      # this way, so web_session keeps whatever the store already holds.
      [[ "$key" == web_session ]] && continue
      value=$(cookie_from_dump "$key" cookies_check.txt)
      if [[ -z "$value" ]] || ! set_cookie_field "$key" "$value"; then
        ok=0
      fi
    done
  fi
  if (( ok )); then
    banner 42 "Cookies参数配置成功!"
  else
    banner 41 "配置有误,请检查"
  fi
}

#######################################
# Menu options 3 and 4: write a new URL suffix into the third line from the
# end of a downloader script, then run that script.
# Arguments: $1 - script (home.py / one.py)
#            $2 - prompt text
#            $3 - noun used in the status banners
#######################################
download_by_url() {
  local script=$1 prompt=$2 noun=$3 suffix=
  read -r -p "$prompt" suffix
  if [[ -z "$suffix" ]]; then
    echo "Error:输入不能为空"
    return
  fi
  if ! replace_field "$script" 2 url_tail "$suffix"; then
    banner 41 "${noun}数据获取失败,请检查"
    return
  fi
  banner 44 "开始获取${noun}数据"
  if winpty python "$script"; then
    banner 42 "${noun}数据获取成功"
  else
    banner 41 "${noun}数据获取失败,请检查"
  fi
}

#######################################
# Menu option 5: update query, number and sort order in search.py (lines
# 10, 8 and 6 above the last line), run it, then build the CSV.
# As before, an empty query or number only prints an error and leaves the
# stored value alone, while an empty sort order cancels the run.
#######################################
download_search() {
  local query= count= order= status=0

  read -r -p "请输入搜索关键词: " query
  if [[ -z "$query" ]]; then
    echo "Error:输入不能为空"
  else
    replace_field search.py 10 quoted "$query" || status=1
  fi

  read -r -p "请输入下载数量(前多少个): " count
  if [[ -z "$count" ]]; then
    echo "Error:输入不能为空"
  else
    replace_field search.py 8 word3 "$count" || status=1
  fi

  read -r -p "请输入搜索排序方式(general: 综合排序 popularity_descending: 热门排序 time_descending: 最新排序): " order
  if [[ -z "$order" ]]; then
    echo "Error:输入不能为空"
    return
  fi
  replace_field search.py 6 quoted "$order" || status=1

  # Do not start a download with settings that could not be written.
  if (( status == 0 )); then
    banner 44 "开始获取搜索数据"
    winpty python search.py || status=1
    # The CSV step still runs after a failed download, but the overall
    # result now reflects both commands instead of only the last one.
    bash csv_search.sh || status=1
  fi

  if (( status == 0 )); then
    banner 42 "搜索数据获取成功"
  else
    banner 41 "搜索数据获取失败,请检查"
  fi
}

# Print the menu followed by the first-login reminder.
show_menu() {
  cat <<'EOF'
【Spider_XHS_助手,Writen by Looper】
|
1.手动填写登录Cookies参数(需通过网页手动获取)
|
2.自动填写登录Cookies参数(需复制cookies.txt文件至程序目录下)
|
3.多用户下载(下载用户列表所有的笔记)
|
4.多笔记下载(下载笔记列表里所有的笔记)
|
5.下载搜索内容
|
6.退出
EOF
  echo -e "\e[37;4;44m===== 首次登录请先填写Cookies参数 =====\e[0m" "\n"
}

# Menu loop: runs until option 6 is chosen or standard input is exhausted.
main() {
  local choice
  while true; do
    show_menu
    # Leave on end-of-input instead of redrawing the menu forever.
    read -r -p "▶请选择功能对象:" choice || break
    case "$choice" in
      1) configure_cookies_manually ;;
      2) configure_cookies_from_export ;;
      3) download_by_url home.py "请输入用户主页url后缀地址: " "用户" ;;
      4) download_by_url one.py "请输入用户笔记url后缀地址: " "笔记" ;;
      5) download_search ;;
      6) exit 0 ;;
      *)
        banner 41 "无效的选择。请重新选择[1-6]"
        read -r -p "按任意键继续..." -n 1 -s
        ;;
    esac
  done
}

main "$@"
优化2:集成表格数据
#!/bin/bash
#
# Turns every detail.txt below the search-result directory into one row of
# output.csv, then moves the directory's content into a timestamped backup
# directory. The exit status is non-zero when the CSV could not be built.

# Report a failed CSV build on stderr.
csv_failed() {
  echo -e "\e[37;4;41m===== CSV表格创建失败 =====\e[0m" >&2
}

#######################################
# Build output.csv from the detail.txt files and archive the results.
# Arguments:
#   $1 - directory holding the search results (default: datas_search)
#   $2 - directory that receives the timestamped backup (default: datas_bak)
# Outputs:  failure banner on stderr
# Returns:  0 on success; 1 if no detail.txt exists or any step fails
#######################################
build_search_csv() {
  local src_dir=${1:-datas_search}
  local bak_root=${2:-datas_bak}
  local header='笔记url,笔记类型,笔记标题,笔记描述,笔记点赞数量,笔记收藏数量,笔记评论数量,笔记分享数量,笔记上传时间,笔记标签'
  local bak_dir csv detail flat
  local -a details=()

  bak_dir="${bak_root}/$(date "+%Y%m%d-%H%M%S")/"
  csv="${src_dir}/output.csv"

  # Collect the inputs NUL-delimited so paths containing blanks survive.
  if [[ -d "$src_dir" ]]; then
    while IFS= read -r -d '' detail; do
      details+=("$detail")
    done < <(find "$src_dir" -name detail.txt -print0)
  fi
  if (( ${#details[@]} == 0 )); then
    csv_failed
    return 1
  fi

  # Writing the header first avoids patching it into line 1 afterwards and
  # discards rows left behind by an earlier, interrupted run.
  if ! printf '%s\n' "$header" > "$csv"; then
    csv_failed
    return 1
  fi

  for detail in "${details[@]}"; do
    flat="${detail}_output.txt"
    # Drop empty / whitespace-only lines. Only files inside the result
    # directory are edited, so existing backups are left untouched.
    if ! sed -i '/^[[:space:]]*$/d' "$detail"; then
      csv_failed
      return 1
    fi
    # Join the lines from the description line up to the IP-location line
    # into one line (the IP-location line itself is dropped).
    if ! awk '/笔记描述: /,/记ip归属地/ {ORS = (/记ip归属地/ ? RS : " "); if (/记ip归属地/) {print ""; next}} {print}' "$detail" > "$flat"; then
      csv_failed
      return 1
    fi
    # One CSV row per note: the text after ": " of every line, each
    # followed by a comma.
    if ! awk -F ': ' '{ printf "%s,", $2 } END { printf "\n" }' "$flat" >> "$csv"; then
      csv_failed
      return 1
    fi
  done

  if ! mkdir -p -- "$bak_dir" || ! mv -- "$src_dir"/* "$bak_dir"; then
    csv_failed
    return 1
  fi
}

build_search_csv datas_search datas_bak
其他:
cookies_check.sh
#!/bin/bash
# Written by Looper 2024.01.11
#
# Splits a cookie string ("name=value; name=value; ...") stored in
# cookies.txt into one "name=value" entry per line on stdout.

#######################################
# Print every ';'-separated entry of a cookie file on its own line.
# Arguments: $1 - cookie file (default: cookies.txt)
# Outputs:   each non-empty entry on stdout, surrounding blanks removed
# Returns:   0 on success, 1 if the file cannot be read
#######################################
split_cookies() {
  local cookie_file=${1:-cookies.txt}
  local content part
  local -a parts=()

  if [[ ! -r "$cookie_file" ]]; then
    printf 'cookies_check: cannot read %s\n' "$cookie_file" >&2
    return 1
  fi

  content=$(<"$cookie_file")
  # `read` stops at the first newline, so turn line breaks into separators
  # to keep entries that sit on later lines.
  content=${content//$'\r'/}
  content=${content//$'\n'/;}

  IFS=';' read -r -a parts <<< "$content"
  for part in "${parts[@]}"; do
    # Strip the blank that follows each ';' (and any trailing blanks).
    part="${part#"${part%%[![:space:]]*}"}"
    part="${part%"${part##*[![:space:]]}"}"
    if [[ -n "$part" ]]; then
      printf '%s\n' "$part"
    fi
  done
  return 0
}

split_cookies cookies.txt
文件结构
Readme
【获取cookies】
1、复制以下JavaScript代码,粘贴至浏览器地址栏
javascript: (function() { const a = document.createElement('a'); a.href = 'data:text/plain,' + document.
cookie; a.download = 'cookies.txt'; a.target = '_blank'; a.style.display = 'none'; document.body.
appendChild(a); a.click(); setTimeout(function() { document.body.removeChild(a); }, 100);})();
2、CTRL+A全选地址栏文本,按左方向箭头将光标移至最前
3、在最前方手动输入以下内容(注意英文输入法,且: 后方有一位空格):
Javascript:
4、按下回车键,下载cookies.txt 文件
5、将cookies.txt 文件复制到程序目录下
6、在程序目录下空白处右击鼠标,选择打开:Open Git Bash here
7、在终端框内输入执行命令:bash get_xhs_info.sh
8、按照终端提示进行配置即可