爬虫
requests
get
# Placeholder: fill in the target URL before running this snippet.
url = ''
# verify=False turns off TLS certificate verification for this request.
resp = requests.get(url, verify=False)
# Decode the raw response bytes as UTF-8 text.
page_source = resp.content.decode('utf-8')
post
# Placeholder: fill in the URL that receives the form submission.
url = ''
# Form fields to send; the empty key/value pair is a template entry.
data = {
'': ''
}
# Headers declaring a URL-encoded form body; X-Requested-With is
# conventionally used to mark the call as an AJAX request.
header = {
'Content-Type': 'application/x-www-form-urlencoded',
'X-Requested-With': 'XMLHttpRequest'
}
# Send the form, forwarding the cookies of the `resp` obtained earlier.
# verify=False turns off TLS certificate verification.
resp = requests.post(url, headers=header, data=data, cookies=resp.cookies, verify=False)
# Decode the body as UTF-8 and parse it as JSON.
# NOTE(review): no `import json` is visible in this file — confirm it is imported.
resp_json = json.loads(resp.content.decode('utf-8'))
session
# Raw cookie string in `name=value` form.
ck = 'a=b'
# One Session object is reused for both requests below.
session = requests.Session()
# First request: the cookie string is converted to a dict and sent with it.
# NOTE(review): per the requests documentation, cookies passed per call are
# not persisted on the session; only cookies set by the server are reused by
# later calls — confirm this is the intended behaviour.
resp = session.post(url, data=data, cookies=str_to_cookie(ck), verify=False)
# Second request through the same session, without explicit cookies.
# NOTE(review): `url2` is not defined anywhere in the visible snippet.
resp2 = session.post(url2, data=data, verify=False)
cookie工具类
def cookie_to_str(cookies):
    """Serialize cookie objects into a ``name=value;`` string.

    Args:
        cookies: Iterable of objects exposing ``name`` and ``value``
            attributes.

    Returns:
        The cookies rendered as ``name=value;`` and concatenated in
        iteration order (note the trailing ``;``). An empty iterable gives
        an empty string. A cookie whose ``value`` is ``None`` is rendered
        as ``name;`` instead of raising ``TypeError``.
    """
    parts = []
    for cookie in cookies:
        if cookie.value is None:
            # Valueless cookie: emit the bare name, as http.cookiejar does.
            parts.append(cookie.name + ';')
        else:
            parts.append(cookie.name + '=' + cookie.value + ';')
    # Join once instead of growing a string with repeated +=.
    return ''.join(parts)
def dict_to_str(cookies):
    """Render a cookie mapping as a ``name=value;`` string.

    Each key/value pair becomes ``key=value;`` and the pieces are
    concatenated in the mapping's iteration order (the result keeps a
    trailing ``;``). An empty mapping yields an empty string.
    """
    return ''.join(
        name + "=" + value + ';'
        for name, value in cookies.items()
    )
def str_to_cookie(cookies):
    """Parse a ``name=value; name2=value2`` cookie string into a dict.

    Args:
        cookies: Cookie string with pairs separated by ``;``.

    Returns:
        A dict mapping each cookie name to its value. Chunks without an
        ``=`` (including the empty chunk after a trailing ``;``) are
        skipped. Later duplicates overwrite earlier ones.
    """
    parsed = {}
    for pair in cookies.split(';'):
        # Strip so the space after ';' in "a=b; c=d" does not end up in the key.
        # Split on the first '=' only, so values containing '=' (e.g. base64
        # padding) are kept instead of being dropped.
        name, sep, value = pair.strip().partition('=')
        if not sep:
            continue
        parsed[name] = value
    return parsed
xpath
pip install lxml
# Parse the downloaded page text into an element tree.
# `etree` presumably comes from lxml (`from lxml import etree`) — confirm.
html = etree.HTML(page_source)

# Absolute path from the document root down to each <a> element.
html_data = html.xpath('/html/body/div/ul/li/a')

# Print the text of every matched element.
for i in html_data:
    print(i.text)
`//`:相对路径,从任意层级匹配节点;`/`:绝对路径,从根节点逐级匹配
例子
.xpath('//li/a/text()')
.xpath('//li/a//@href')
.xpath('//li/a[@href="link.html"]')
.xpath('//li[last()]/a/text()')
.xpath('//a[contains(@href, "link")]')
.xpath(r'//a[re:test(@id, "i\d+")]/text()', namespaces={'re': 'http://exslt.org/regular-expressions'})