urlencode
urlencode
# 对字典进行URL编码
# https://blog.csdn.net/weixin_43411585/article/details/88854544
pyquery
pyquery
# pyquery 是一个类似 jQuery、专供 Python 使用的 HTML 解析库
# https://www.cnblogs.com/gj5379/p/8514535.html
# 标签
'''
# Example: load a local HTML file and print the text of its <h2> elements.
from pyquery import PyQuery as pq

# The body of a with-statement must be indented; the file is closed
# automatically when the block exits.
with open("index.html", "r") as f:
    contents = f.read()

# Parse the markup, select the <h2> elements and extract their text.
doc = pq(contents)
text = doc("h2").text()
print(text)
'''
# 属性
# https://geek-docs.com/python/python-tutorial/python-pyquery.html#ftoc-heading-1
jsonpath
jsonpath
# https://blog.csdn.net/nd211314555/article/details/88426529
xpath
xpath
https://www.bilibili.com/h5/note-app/view?cvid=17158089&pagefrom=comment
scrapy
#Scrapy,Python开发的一个快速、高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据。
#Scrapy用途广泛,可以用于数据挖掘、监测和自动化测试.
# https://blog.csdn.net/ck784101777/article/details/104468780
ProxyPool
检验
#!/bin/bash
# Probe every proxy stored in the Redis sorted set "proxies:universal" and
# report which ones can fetch https://www.youtube.com/ with HTTP status 200.
#
# Outputs:
#   ./pool                                   - proxy list dumped from Redis
#   ./log2/<proxy>_proxy_can_access_youtube  - one marker file per working proxy
#   stdout                                   - a SUCCESS / Failure line per proxy

# Remove results left over from earlier runs (contents of every ./log* dir).
rm -rf ./log*/*
# The SUCCESS branch writes into ./log2; make sure it exists so tee cannot
# fail on a fresh working directory.
mkdir -p ./log2

# Dump all members of the sorted set, one proxy per line. Stop here if the
# dump fails instead of probing a stale or empty list.
if ! redis-cli -h 192.168.3.13 -p 6379 ZRANGE proxies:universal 0 -1 >pool; then
  echo "failed to read proxy list from redis" >&2
  exit 1
fi

# Read the list line by line (no word-splitting / glob expansion of entries)
# and probe each proxy in its own background job.
while IFS= read -r i || [[ -n "$i" ]]; do
  [[ -n "$i" ]] || continue
  {
    # -w '%{http_code}' prints only the status code; the body is discarded.
    # A proxy that cannot connect within 3 seconds yields a non-200 code.
    code=$(curl -s -o /dev/null --connect-timeout 3 -w '%{http_code}' \
      --proxy "$i" https://www.youtube.com/)
    if [[ "$code" == "200" ]]; then
      echo " $i ---> SUCCESS " | tee "./log2/${i}_proxy_can_access_youtube"
    else
      echo " $i ---> Failure "
    fi
  } </dev/null &   # keep background jobs from consuming the loop's input
done <pool

# Block until every probe has finished so that all results have been printed
# and written before the script exits.
wait
参考
Window下Redis的安装和部署详细图文教程(Redis的安装和可视化工具的使用)
通过curl获取HTTP状态返回码