Python爬虫入门

最新推荐文章于 2023-10-17 16:13:40 发布

jackson-rick

最新推荐文章于 2023-10-17 16:13:40 发布

阅读量156

点赞数

分类专栏： Python 文章标签：爬虫 requests lxml

本文链接：https://blog.csdn.net/xujunfei520/article/details/102851501

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

爬虫

requests

get

# Placeholder: set this to the address of the page to fetch.
url = ''
# NOTE(review): needs `import requests`, which this snippet does not show.
# verify=False turns off TLS certificate verification for this request.
resp = requests.get(url, verify=False)
# Decode the raw response bytes as UTF-8 to obtain the page text.
page_source = resp.content.decode('utf-8')

post

# Placeholder: set this to the endpoint that receives the form.
url = ''
# Form fields to submit; replace the empty key/value pair with real ones.
data = {
    '': ''
}
# Headers declaring a URL-encoded form body and tagging the request as
# XMLHttpRequest.
header = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest'
}
# NOTE(review): `resp` on the right-hand side must already exist (presumably
# the response from the GET example); its cookies are sent with this request.
resp = requests.post(url, headers=header, data=data, cookies=resp.cookies, verify=False)
# NOTE(review): needs `import json`, not shown here.
# Decode the body as UTF-8 and parse it as JSON.
resp_json = json.loads(resp.content.decode('utf-8'))

session

# Cookie string in "name=value" form.
ck = 'a=b'
# NOTE(review): `url`, `url2` and `data` are not defined in this snippet;
# presumably the reader supplies them.
session = requests.Session()
# First request: pass the cookie dict produced by str_to_cookie().
# NOTE(review): in requests, cookies given per call via `cookies=` are not
# stored on the session; only cookies set by the server carry over to the
# next call. Confirm this is the intent.
resp = session.post(url, data=data, cookies=str_to_cookie(ck), verify=False)
# Second request goes through the same session without explicit cookies.
resp2 = session.post(url2, data=data, verify=False)

cookie工具类

def cookie_to_str(cookies):
    """Serialize cookie objects into a ``name=value;`` string.

    Each item of ``cookies`` must expose string attributes ``name`` and
    ``value``. Every pair, including the last one, is terminated by ``;``.
    An empty iterable yields an empty string.
    """
    pairs = [item.name + "=" + item.value + ';' for item in cookies]
    return ''.join(pairs)


def dict_to_str(cookies):
    """Serialize a cookie mapping into a ``name=value;`` string.

    Keys and values must be strings. Pairs are emitted in the mapping's
    iteration order and each one, including the last, is followed by ``;``.
    An empty mapping yields an empty string.
    """
    return ''.join(key + "=" + val + ';' for key, val in cookies.items())


def str_to_cookie(cookies):
    """Parse a ``name=value; name2=value2`` cookie string into a dict.

    Args:
        cookies: Cookie pairs separated by ``;``. Whitespace around each
            name and value is ignored, so both ``a=b;c=d`` and the usual
            header form ``a=b; c=d`` are accepted.

    Returns:
        A dict mapping cookie names to values. Segments that contain no
        ``=`` (for example the empty piece after a trailing ``;``) are
        skipped. When a name repeats, the last occurrence wins.
    """
    parsed = {}
    for pair in cookies.split(';'):
        # Split on the first '=' only: values such as base64 tokens may
        # themselves contain '=' and must not be dropped.
        name, sep, value = pair.partition('=')
        if not sep:
            continue
        parsed[name.strip()] = value.strip()
    return parsed

xpath

pip install lxml

# NOTE(review): needs `from lxml import etree`; `page_source` is presumably
# the HTML text produced by the GET example.
html = etree.HTML(page_source)
# Absolute XPath: select the <a> elements under /html/body/div/ul/li.
html_data = html.xpath('/html/body/div/ul/li/a')
for i in html_data:
    # Print the text of each matched element.
    print(i.text)

`//` 表示相对路径（从文档任意层级开始匹配），`/` 表示绝对路径（从根节点开始逐级匹配）

例子
.xpath('//li/a/text()')
.xpath('//li/a//@href')
.xpath('//li/a[@href="link.html"]')
.xpath('//li[last()]/a/text()')
.xpath('//a[contains(@href, "link")]')
.xpath('//a[re:test(@id, "i\d+")]/text()')