Python爬虫入门

爬虫

requests

get

# GET example: `url` is an empty placeholder to be filled in with the target address.
url = ''
# verify=False disables TLS certificate verification for this request.
resp = requests.get(url, verify=False)
# Decode the raw response bytes as UTF-8 to obtain the page text.
page_source = resp.content.decode('utf-8')

post

# POST example: `url` and the single form field below are empty placeholders.
url = ''
data = {
    '': ''
}
# Headers declaring a URL-encoded form body and marking the request as XMLHttpRequest.
header = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest'
}
# Sends the cookies of the previous response (`resp`) along with the form;
# certificate verification is disabled via verify=False.
resp = requests.post(url, headers=header, data=data, cookies=resp.cookies, verify=False)
# Decode the response bytes as UTF-8 and parse them as JSON.
resp_json = json.loads(resp.content.decode('utf-8'))

session

# Session example: `ck` is a cookie string in "name=value" form.
ck = 'a=b'
session = requests.Session()
# First request: the cookie string is converted to a dict and passed explicitly.
resp = session.post(url, data=data, cookies=str_to_cookie(ck), verify=False)
# Second request goes through the same Session object without an explicit
# cookies argument. NOTE(review): `url2` is not defined in this snippet — supply it.
resp2 = session.post(url2, data=data, verify=False)

cookie工具类

def cookie_to_str(cookies):
    """Serialize cookie objects into a ``name=value;`` string.

    Args:
        cookies: Iterable of objects exposing string ``name`` and ``value``
            attributes.

    Returns:
        The concatenated ``name=value;`` pairs; every pair, including the
        last one, is terminated by ``;``. An empty iterable yields ``''``.
    """
    pairs = [item.name + "=" + item.value + ';' for item in cookies]
    return ''.join(pairs)


def dict_to_str(cookies):
    """Serialize a cookie dict into a ``name=value;`` string.

    Args:
        cookies: Mapping of cookie names to cookie values (both strings).

    Returns:
        The concatenated ``name=value;`` pairs in the mapping's iteration
        order; every pair, including the last one, is terminated by ``;``.
        An empty mapping yields ``''``.
    """
    pairs = [name + "=" + value + ';' for name, value in cookies.items()]
    return ''.join(pairs)


def str_to_cookie(cookies):
    """Parse a ``name=value;name=value`` cookie string into a dict.

    Args:
        cookies: Cookie string whose pairs are separated by ``;``.

    Returns:
        A dict mapping each cookie name to its value. Segments that contain
        no ``=`` (including the empty segment after a trailing ``;``) are
        skipped. If a name occurs more than once, the last value wins.
    """
    parsed = {}
    for segment in cookies.split(';'):
        # Split on the first '=' only, so values that themselves contain '='
        # (e.g. base64 padding) are kept instead of being silently dropped.
        name, sep, value = segment.partition('=')
        if sep:
            # Pairs are commonly written "a=b; c=d"; strip the padding so the
            # key is "c" rather than " c".
            parsed[name.strip()] = value.strip()
    return parsed

xpath

pip install lxml

# Parse the downloaded page text into an element tree.
# NOTE(review): `etree` is presumably `lxml.etree` (see the pip command above) — the import is not shown.
html = etree.HTML(page_source)
# Select the <a> elements reached by this absolute path from the document root.
html_data = html.xpath('/html/body/div/ul/li/a')
for i in html_data:
    # Print the text of each matched element.
    print(i.text)

`//`:相对路径(匹配任意层级的后代节点);`/`:绝对路径(从根节点开始逐级匹配)

例子
.xpath('//li/a/text()')
.xpath('//li/a//@href')
.xpath('//li/a[@href="link.html"]')
.xpath('//li[last()]/a/text()')
.xpath('//a[contains(@href, "link")]')
.xpath(r'//a[re:test(@id, "i\d+")]/text()', namespaces={'re': 'http://exslt.org/regular-expressions'})

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值