一、Python爬虫中经常使用的小技巧总结
0. Python终端覆盖(原地刷新)输出:把print的结尾字符(end参数)设置为\r即可
# end='\r' returns the cursor to the start of the line so the next print overwrites it
print('Hello Word', end='\r')
1. 绕过https验证
添加参数:verify=False
# verify=False makes requests skip TLS certificate verification for this call
res = requests.post(url=url,headers=headers,data=data,verify=False)
2. 屏蔽绕过HTTPS验证后仍然会提示的警告信息
import urllib3
# Silence the urllib3 warnings that are still printed after HTTPS verification is bypassed
urllib3.disable_warnings()
3. 只获取当前文件的绝对路径位置,并且不会因为执行位置的不同而发生改变
import pathlib
# Directory containing this file. resolve() makes the result absolute even
# when __file__ is a relative path, so it does not depend on where the
# script is launched from.
pathlib.Path(__file__).resolve().parent
4. Windows端初始化,兼容Linux终端的彩色字体输出
import colorama
# Initialise colorama once at startup so coloured terminal output also works on Windows
colorama.init()
5. 配合使用gevent协程与time.sleep(1)睡眠函数终止退出多线程
# Import gevent's monkey module and apply patch_all() right away
from gevent import monkey;monkey.patch_all()
6、常用的pyppeteer 的通用设计代码
使用pyppeteer时不能很好地关闭Chrome浏览器,容易导致计算机资源严重超载,建议改用playwright。毕竟pyppeteer已经没有人维护了,而新出的playwright用法与pyppeteer差不多,并且资源和功能也管理得很好。
新模拟器playwright的Python代码封装_冒险岛_0_的博客-CSDN博客
import asyncio
from pyppeteer import launch
from lxml import etree
# Module-level default request headers
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.8",
    "Accept-Encoding": "gzip",
}
# Helper: evaluate an XPath expression against an HTML string
def get_xpath(html, r):
    """Parse ``html`` with ``etree.HTML`` and return the result of XPath ``r``.

    Args:
        html: HTML source text to parse.
        r: XPath expression to evaluate on the parsed tree.

    Returns:
        Whatever ``tree.xpath(r)`` yields for the parsed document.
    """
    return etree.HTML(html).xpath(r)
# cookie='ANID=AHWqTUnSgwb_CzE_dVnzQ3Eqbzu5REOhyUS0tU5m4sShSCeaso0OhWflzXdOwAhQ; CONSISTENCY=AKJVzcotAi6Qh8s5nHBkNb-XB5SKd1f_GN6KOwvPanuHPgJ4sJEpuK0BQXPvf5hGxRNmbRJOUGtaR_Y8yzv6-TbJgz48QnJ2-SNqu_O6q4copWOIzM4vY0N6UqxQuBjTWQ50inMQDnZUyiB5gJUHBaHQo-_gSZshbXwQf6x9JIoJd8cNdGac43v8SQKu5rTRAPjIB-TYIn_4; SID=6QfNqbL3D7C9uZWknIn0-HpaxxKvoGFRBKx2KKoqaWzbkvNGEb-SkXghwVQ4J0SOPDZWLA.; __Secure-3PSID=6QfNqbL3D7C9uZWknIn0-HpaxxKvoGFRBKx2KKoqaWzbkvNGW6SohDuMlenW2656HjpYRw.; HSID=AnUnk0Xd_PDbr1CgX; SSID=AAz0gxoTZWfwIXoX-; APISID=tV_fhNoeMURfM_xz/A-gOvA5tVlZDtAxmL; SAPISID=wIMSRhDeOiUH-Clu/AvXnLhTae9Mrx1-Su; __Secure-3PAPISID=wIMSRhDeOiUH-Clu/AvXnLhTae9Mrx1-Su; 1P_JAR=2021-02-01-15; SEARCH_SAMESITE=CgQI45EB; NID=208=cqjZ5t2_whUAPKRhLPeLcJHes0ZYhdmfXTzpT7e1jAq8FUVW6NT3RTAu-y5hSKbnaNoBVI1Py4NR1ERpncr7Tu9c0jjAasfVKo7wZkUUsHAGhvzcsOGRtg_C77UvgMaZ2jJLuJfxx4FtwkecfmvvNwiYmwmWv24e13DXwFIxVnDWKsX4uxAv0mujj5qqcmLFDJoDLgGMreYDuUa5; OSID=6QfNqV9JWXrGwgwF2Sj26bQpr3qjdOHCKwNEdfVoLjdinBPshNdmKSqli1hQvskM-ohYRg.; __Secure-OSID=6QfNqV9JWXrGwgwF2Sj26bQpr3qjdOHCKwNEdfVoLjdinBPsOA_Z8pNgf_oLjWr_v0ByYg.; __utmc=245730968; __utmt_t0=1; OTZ=5831467_24_24__24_; __utma=245730968.1064800813.1612191997.1612191997.1612192039.2; __utmz=245730968.1612192039.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmb=245730968.7.9.1612192080268; SIDCC=AJi4QfFGrDO33DqFaCYZ6a7Xq3WUtgQhLbU3vUfThTF2otcsyFczqVas0SMd21VBpQb0Qllf; __Secure-3PSIDCC=AJi4QfFcw_eACVXEuWmc9ItdYHCpIVc3VHQL5VVO66DjSPr07boScqhaB8qbmag6FxSElH1Gdg'
# Fetch a page's rendered HTML with a headless pyppeteer browser
async def main(url, cookie):
    """Open ``url`` in a headless browser and return the page source.

    Args:
        url: Address of the page to load.
        cookie: Raw ``Cookie`` header value to send with requests. A falsy
            value (``''`` or ``None``) means no Cookie header is set.

    Returns:
        The page content as a string, or ``None`` when any step failed.
        Errors are printed rather than raised (best-effort behaviour).
    """
    page_text = None
    browser = None
    try:
        # For debugging, set "headless" to False to see the browser window.
        browser = await launch(
            {
                "headless": True,
                # Do not let pyppeteer install its own signal handlers.
                # NOTE(review): presumably so launch() can be called outside
                # the main thread - confirm.
                'handleSIGINT': False,
                'handleSIGTERM': False,
                'handleSIGHUP': False,
                # 'userDataDir': 'G:/BaiduNetdiskDownload/cache'
                #   Works around "Unable to remove Temporary User Data": keep
                #   the cache on a large, non-system disk.
            },
            # args=['--start-maximized']                   # full-screen window
            # ignoreDefaultArgs=['--enable-automation']    # hide the "controlled by automated software" banner
        )
        page = await browser.newPage()
        # Only send a Cookie header when there is a value to send.
        if cookie:
            await page.setExtraHTTPHeaders({'Cookie': cookie})
        # Hide navigator.webdriver from site-side bot detection. This must be
        # registered for *new* documents: a plain evaluate() would only patch
        # the blank page and be lost as soon as goto() navigates.
        await page.evaluateOnNewDocument('''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')
        # NOTE(review): pyppeteer's goto() appears to read only navigation
        # options such as timeout/waitUntil, so `headers` is likely ignored
        # here - confirm, and move the headers to setExtraHTTPHeaders /
        # setUserAgent if they are meant to be sent.
        await page.goto(url=url, headers=headers, timeout=100000)
        page_text = await page.content()
    except Exception as e:
        print(e)
    finally:
        # Always close the browser, even after a failure, so a Chromium
        # process is not left running.
        if browser is not None:
            try:
                await browser.close()
            except Exception as e:
                print(e)
    return page_text
# Synchronous wrapper: fetch the HTML of a page via pyppeteer
def get_pyppeteer(url, cookie=None):
    """Run ``main(url, cookie)`` on a new event loop and return the HTML.

    Args:
        url: Address of the page to load.
        cookie: Optional raw ``Cookie`` header value; ``None`` is passed on
            as an empty string.

    Returns:
        The page HTML as a string, or ``False`` when the page could not be
        fetched or the response contains Google's "not a robot" check.
    """
    if cookie is None:
        cookie = ''
    # Each call creates a new event loop and installs it as the current one.
    # NOTE(review): the loop is never closed (the original had close()
    # commented out) - confirm whether closing it is safe with pyppeteer.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    html = loop.run_until_complete(main(url, cookie))
    # main() returns None on failure; report that as False instead of
    # crashing on the substring test below.
    if html is None:
        return False
    if ' and not a robot' in html:  # Google robot-check page
        return False
    return html
# Manual smoke test when the file is run directly
if __name__ == '__main__':
    url = 'https://www.google.com/search?q=site:baidu.com&btnG=Search&hl=en-US&biw=&bih=&gbv=1&start=1&filter=0'
    print(get_pyppeteer(url))
7、Python专属终端开发工具Rich
Rich 是一个 Python 库,可以为你在终端中提供富文本和漂亮、精美的格式。
使用 Rich API 可以很容易的在终端输出添加各种颜色和不同风格。它可以绘制漂亮的表格,进度条,Markdown,突出显示语法的源代码及回溯等等,优秀的功能不胜枚举。