1. requests登录
import requests
# Automatic login with requests:
# Step 1: log in to the target website manually in a browser.
# Step 2: copy the cookie the site issued after the login succeeded.
# Step 3: send that cookie in the request headers of every request.
# NOTE(review): a session cookie is a credential; prefer loading it from an
# environment variable or an untracked file instead of committing it.
headers = {
    # The cookie is one logical string. It is written as two adjacent
    # literals, which Python concatenates, so the value has no line break.
    'cookie': (
        '_zap=419d047d-8bc0-4ade-a1bf-7f23f64e4c17; d_c0=AGBWipIMjhaPTlzKnPcBYqOBj91ms1kb21w=|1680269876; _xsrf=234dfec3-cd21-48ea-9b39-3e6c5fab595e; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1680269878,1680318866; captcha_session_v2=2|1:0|10:1680318867|18:captcha_session_v2|88:M0FGVTRFOXU4akdPOHJ3Uy9neHc4TUhqZlVUYUIxOFVFVFZid3hhdGl4enJtZmpabGdoOU5BUDlYMnlLdzFFKw==|a2195fa485379e7a41d37b049b9f6dbe882dcfef6ff2e272c0c83f79b0297b5f; SESSIONID=bkZKbUZb9cWDK31z16gc3P9wrgpEoxLOTfXxBSSI1Kx; __snaker__id=3Vt06mcnvlwVqUuj; JOID=VVETAEzn9KDdRFn2S-PKuTzRUXZci5D_kiYCoyzemficC2rJPnVfhr1KWPxKi1p0MO_BZPB9O065RXy5TmqukJQ=; osd=UVoSBE_j_6HZR139SufJvTfQVXVYgJH7kSIJoijdnfOdD2nNNXRbhblBWfhJj1F1NOzFb_F5OEqyRHi6SmGvlJc=; gdxidpyhxdE=n10HU5m0CbqAcdr8xWHoJNbE0vn+E4SSXyiSw109W8Xr3o7BvRfKj/EY3d+vC3csTcQvx+6e5lZD2oTpTjQy5oRcvqUtU59O+ykZv3THlZDfgHs8Rifr844moDjZC76xZuAp9ZU05BaH0n4rVPrgJX9QcGqatDU1Q1/1dSkmkOg8Pqos:1680319767241; YD00517437729195:WM_NI=Xbx9QrAECV1VKTNGbXUAhQtk7No8KRddJbKO/byUb8rZ3wTcST02LGvydf4qOfrerCFXCgGivkPTZMveAahXAkfTPq1bVq7/AuxCJmutAEMfFPuF9vYPKTSM5uQOAkmcYWQ=; YD00517437729195:WM_NIKE=9ca17ae2e6ffcda170e2e6ee93ca72b5afe1b5e73bae9a8ea2c45a878b9badd145b0aa8794d3338bb3bb84e72af0fea7c3b92a9cbeb8dad47a839b8db7b45e898c82a8e154979fa8b2f761f592a1d9ca5aa8bc8683ca45edbeb7b6e134b286fd98d769918c00d3cf4db6a7ac87b77b8dbabe86f14986888d88c663b6b5a0b5d947b6bb8f82eb5483b5aab5d37bf3be9a92b62583abb8a4c967b7ac8fa6d06e8690bba7bc7e9b9085b5d247f7ab9fb0c461afbd97b8d437e2a3; YD00517437729195:WM_TID=B5L63CsgbwZFRERAUFeUOlrZsKtIyuNc; '
        'captcha_ticket_v2=2|1:0|10:1680318873|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfcFhiMU91dlJNN2ktQTlyLjY4Z1pkcndINUNZYnZyemVKY3gyVzZ4V0luaURoYVpBaE1ydlNac3A0OWxQalNPbEpRSHl4QzlLTml1SzJWNFNfNm5US2JBa1lLLlVTOWdsejlxSFR2bXJvMlpoTkI4MnpnYkNERFlyaThXVk1aWTFCQ2NDRzlHd0l0LV83c1lIeWl0ZE1jY0t5Z3ZLbzI3Wi5rbFJhRVpTd1VEcEFNS2FPaFRkdVhleEJNY1hXOFdCaXZkY0lXY19aMllBZ2ZIV1FtOTF6WHdzelR4MkFtT19kS0ZpSmlNMTVDMUFOTDVTT1dzNUtZWVdTZ1BhME11SlM1QVU4X29JX3ktRFdwQmV2NkNlcWZKdzJIVWp5UkpfVnBoRi5SLkJWSVdUdWRBUkNRVVRBRHZBRDRjckRreGkyR2ZJYS5VVy5xMFN6TE5fdTRkOVhNYUlJOTFPbFRBVkpaX3NvbW1QZFJvbnhFNk1JMXV1T2ZEeWtpdXlGTkxpQlV3OUpYN3Z3aVNBYXBpc0tya2NFbHdacEtwbVBzNUhBVW43SEJYRHc3QWgyU0ZZeXg1RWJpMk4uWnRzVldoTC1PUEdka00yb1lac0JsVTVjSy5Fb0FnZGhsR25ub2pQUVRWNW9hVndhSEdXMjB0dTA5bnd4ZUNybU1yMyJ9|e8442557bb3a316eb50b0737681d5ad1d38f99569b680d3030222c85f91ee04e; z_c0=2|1:0|10:1680318914|4:z_c0|92:Mi4xSzNfM0RBQUFBQUFBWUZhS2tneU9GaVlBQUFCZ0FsVk53dThVWlFEd3JoSmRuV0l0bEE5aTVJWG1QemNYSWhhbVR3|0a6e5fa18bff104e4a0644fb23ce144606868fb05bb78b7170206acf203b9cb6; q_c1=27f0ae6c555b4de4a8ff458f17bb3ade|1680318915000|1680318915000; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1680318916; tst=r; KLBRSID=e42bab774ac0012482937540873c03cf|1680318918|1680318866'
    ),
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(r'https://www.zhihu.com/', headers=headers, timeout=10)
print(response.text)
2. selenium登录
from selenium.webdriver import Chrome
# 1. Create a browser and open the site that needs an automatic login.
b = Chrome()
b.get('https://www.taobao.com/')
# 2. Block until the user confirms the manual login is finished; the window
#    driven by `b` must be showing the logged-in page at that point.
input('是否已经完成登录:')
# 3. Fetch the cookies of the logged-in session and save them to a local file.
result = b.get_cookies()
# The cookies are written as the Python repr of the value returned by
# get_cookies().
with open(r'file/cookie.txt', 'w', encoding='utf-8') as f:
    f.write(str(result))
3. selenium使用cookie
from selenium.webdriver import Chrome
import ast

# 1. Create a browser and open the site that needs an automatic login.
# NOTE(review): the page is opened before adding cookies, presumably because
# Selenium only accepts cookies for the currently loaded domain - confirm.
b = Chrome()
b.get('https://www.taobao.com/')
# 2. Load the cookies that were saved locally.
with open(r'file/cookie.txt', encoding='utf-8') as f:
    # The file holds the repr of a list of dicts. literal_eval parses Python
    # literals only, so a tampered file cannot execute code the way eval() can.
    result = ast.literal_eval(f.read())
# 3. Add every saved cookie to the browser session.
for x in result:
    b.add_cookie(x)
# 4. Open the page again so the request is sent with the cookies attached.
b.get('https://www.taobao.com/')
# Keep the script (and the browser window) alive until the user presses Enter.
input('是否已经完成登录:')
4. selenium代理
from selenium.webdriver import Chrome, ChromeOptions
# Route the browser's traffic through a proxy server.
options = ChromeOptions()
proxy_argument = '--proxy-server=http://121.61.160.216:4583'
options.add_argument(proxy_argument)
# Start Chrome with the proxy option applied, then load the target page.
b = Chrome(options=options)
start_url = 'https://movie.douban.com/top250?start=0&filter='
b.get(start_url)
5. xpath用法
# xpath是用来解析网页数据或者xml数据的一种解析方法,它通过路径来获取标签(元素)
'''
Python数据:{'name': 'xiaoming', 'age': 18, 'is_ad': True, 'car_no': None}
Json数据:{"name": "xiaoming", "age": 18, "is_ad": true, "car_no": null}
xml数据:
<allStudent>
<student class="优秀学员">
<is_ad>是</is_ad>
<car_no></car_no>
</student>
<student class="优秀学员">
<name>xiaoming</name>
<age>18</age>
<is_ad>是</is_ad>
<car_no></car_no>
</student>
</allStudent>
'''
-
常见的几个概念
- 树:整个网页结构和xml结构就是一个树结构
- 元素(节点):html树结构的每个标签
- 根节点:树结构中的第一个节点
- 内容:标签内容
- 属性:标签属性
-
Xpath语法
-
获取标签
-
绝对路径:以'/'开头,然后从根节点开始层层往下写路径
-
相对路径:写路径的时候用'.'或者'..'开头,其中'.'表示当前节点,'..'表示当前节点的父节点
注意:如果路径以'./'开头,'./'可以省略
-
全路径:以'//'开头的路径
-
-
获取标签内容:在获取标签的路径的最后加'/text()'
-
获取标签属性:在获取标签的路径的最后加'/@属性名'
-
# 应用
from lxml import etree
# 1. Build the tree structure and get the root node.
# The with-block closes the file as soon as its text has been read.
with open('file/data.html', encoding='utf-8') as f:
    html = f.read()
root = etree.HTML(html)
# 2. Select tags by path.
# node.xpath(path) collects every match for the path and returns them as a
# list: node objects for a tag path, strings for a path that ends in /text()
# or /@attribute. The trailing comments show the output recorded in the notes.
for path in (
    '/html/body/div/a',         # tags -> [<Element a at 0x20ec5403e80>, <Element a at 0x20ec5403ec0>]
    '/html/body/div/a/text()',  # tag contents -> ['我是超链接2', '我是超链接4']
    '/html/body/div/a/@href',   # tag attributes -> ['https://www.baidu.com', 'https://www.taobao.com']
):
    result = root.xpath(path)
    print(result)

# The remaining demos pair each path with the node that xpath() is called on.
div = root.xpath('/html/body/div')[0]
for node, path in (
    # 1) Absolute path: the result does not depend on which node calls xpath().
    (div, '/html/body/div/a/text()'),  # ['我是超链接2', '我是超链接4']
    # 2) Relative path: resolved from the calling node; a leading './' is optional.
    (root, './body/div/a/text()'),     # ['我是超链接2', '我是超链接4']
    (div, './a/text()'),               # ['我是超链接2', '我是超链接4']
    (div, 'a/text()'),                 # ['我是超链接2', '我是超链接4']
    # 3) Full path: a path that starts with '//'.
    (root, '//a/text()'),              # ['我是超连接1', '我是超链接2', '我是超链接4', '我超链接3']
    (root, '//div/a/text()'),          # ['我是超链接2', '我是超链接4', '我超链接3']
):
    result = node.xpath(path)
    print(result)
-
加谓语(加条件) - 路径中的节点[]
-
位置相关谓语
[N] - 第N个指定标签
[last()] - 最后一个指定标签
[position()>N]、[position()<N]、[position()>=N]、[position()<=N]
-
属性相关谓语
[@属性名=属性值]
-
# Positional predicates: choose among the matched <p> tags by position.
result = root.xpath('//span/p[2]/text()')  # the 2nd <p>
print(result)
result = root.xpath('//span/p[last()]/text()')  # the last <p>
print(result)
# position() is a function and needs its parentheses: a bare `position` is
# read as a child element named "position", so the predicate would normally
# match nothing instead of selecting the first two <p> tags.
result = root.xpath('//span/p[position()<=2]/text()')
print(result)
result = root.xpath('//span/p[last()-1]/text()')  # the second-to-last <p>
print(result)
# Attribute predicates: keep only the <p> tags whose attribute has the value.
result = root.xpath('//span/p[@id="p1"]/text()')
print(result)
result = root.xpath('//span/p[@class="c1"]/text()')
print(result)
result = root.xpath('//span/p[@data="5"]/text()')
print(result)
-
通配符
在xpath中可以通过*来表示任意标签,通过@*来表示任意属性
# Wildcard demos: '*' stands for any tag and '@*' for any attribute.
for path in (
    '//span/*/text()',               # '*' in place of a tag name
    '//span/*[@class="c1"]/text()',  # '*' combined with an attribute predicate
    '//span/span/@*',                # '@*' in place of an attribute name
    '//*[@class="c1"]/text()',       # '*' directly after '//'
):
    result = root.xpath(path)
    print(result)