Python Web Scraping

urllib 

HTTPS is more secure than HTTP, and sites served over HTTPS commonly apply User-Agent (UA) based anti-scraping checks, so requests need a browser-like User-Agent header.
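A minimal sketch of sending a browser-like User-Agent with urllib; the URL is only an example:

import urllib.request

url = 'https://www.baidu.com'
# Without a browser-like User-Agent, many HTTPS sites return an error page or no data
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))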

GET requests: the quote method (percent-encodes a single string so it can be placed in a URL)
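A minimal sketch of quote; the keyword is illustrative:

import urllib.parse

# quote percent-encodes a single string so it can be embedded in a URL
keyword = urllib.parse.quote('周杰伦')
url = 'https://www.baidu.com/s?wd=' + keyword
print(url)  # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6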

GET requests: urlencode (a step up from quote: encodes multiple key-value parameters at once and joins them with &)
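A minimal sketch of urlencode; the parameters are illustrative:

import urllib.parse

# urlencode percent-encodes a whole dict of parameters and joins them with &
params = {'wd': '周杰伦', 'sex': '男'}
query = urllib.parse.urlencode(params)
url = 'https://www.baidu.com/s?' + query
print(url)  # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7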

POST requests (after urlencode, the data must additionally be converted to bytes with encode(), as the KFC example below shows)

Ajax GET requests: scraping the first page of Douban movies
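A minimal sketch, assuming Douban's movie-chart ajax endpoint (the URL and its parameters may have changed since):

import urllib.request

# Assumed ajax endpoint: first page = 20 items starting at offset 0
url = ('https://movie.douban.com/j/chart/top_list'
       '?type=5&interval_id=100%3A90&action=&start=0&limit=20')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
content = urllib.request.urlopen(request).read().decode('utf-8')
with open('douban_page1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)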

Ajax POST requests: scraping the first ten pages of KFC store addresses

import urllib.request
import urllib.parse

def create_request(page):
    base_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '郑州',   # city to search (Zhengzhou)
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    # urlencode builds the form string; encode() turns it into the bytes urlopen requires
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # Build the customized request object
        request = create_request(page)
        # Fetch the page source
        content = get_content(request)
        # Save it to disk
        down_load(page, content)

urllib exceptions (urllib.error provides HTTPError and URLError)
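A minimal sketch of catching both; HTTPError is a subclass of URLError, so it must be caught first (the URL is illustrative):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://example.com/does-not-exist')
except urllib.error.HTTPError as e:
    # The server answered, but with an error status (404, 500, ...)
    print('HTTP error:', e.code)
except urllib.error.URLError as e:
    # The request never completed (unknown host, refused connection, ...)
    print('URL error:', e.reason)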

Basic usage of urllib handlers
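A minimal sketch of the handler chain (handler → build_opener → open), the lower-level alternative to urlopen:

import urllib.request

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
request = urllib.request.Request(url=url, headers=headers)

handler = urllib.request.HTTPHandler()         # 1. build a handler
opener = urllib.request.build_opener(handler)  # 2. build an opener from it
response = opener.open(request)                # 3. open replaces urlopen
print(response.read().decode('utf-8'))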

urllib proxies
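A minimal sketch with ProxyHandler; the proxy address is a placeholder and must be replaced with a working proxy:

import urllib.request

request = urllib.request.Request(
    url='http://httpbin.org/ip',
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
)
proxies = {'http': 'http://127.0.0.1:8888'}  # placeholder proxy, substitute your own
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
print(response.read().decode('utf-8'))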

Parsing: xpath

Parsing: using xpath to extract the 百度一下 button from the Baidu homepage
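A minimal sketch, assuming Baidu's search button is still <input id="su"> with its text in the value attribute:

import urllib.request
from lxml import etree

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
request = urllib.request.Request(url=url, headers=headers)
content = urllib.request.urlopen(request).read().decode('utf-8')

tree = etree.HTML(content)
# Assumed page structure: the button text lives in the value attribute
result = tree.xpath('//input[@id="su"]/@value')
print(result)  # expected: ['百度一下']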

Parsing: scraping the couple-photo gallery from 站长素材 (sc.chinaz.com)

import os
import urllib.request
from lxml import etree

# Page 1:          https://sc.chinaz.com/tupian/qinglvtupian.html
# Page n (n >= 2): https://sc.chinaz.com/tupian/qinglvtupian_n.html
def create_request(page):
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/qinglvtupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/qinglvtupian_' + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(content):
    tree = etree.HTML(content)
    # All image names
    name_list = tree.xpath('//div[@class="container"]//img/@alt')
    # All image URLs: the images are lazy-loaded, so the real URL is in
    # data-original rather than src
    src_list = tree.xpath('//div[@class="container"]//img/@data-original')
    os.makedirs('./imgPacket', exist_ok=True)  # ensure the target folder exists
    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        url = 'https:' + src
        urllib.request.urlretrieve(url=url, filename='./imgPacket/' + name + '.jpg')

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # 1. Build the customized request object
        request = create_request(page)
        # 2. Fetch the page source
        content = get_content(request)
        # 3. Download the images
        down_load(content)

jsonpath

JSONPath quick start: https://blog.csdn.net/luxideyao/article/details/77802389

jsonpath: parsing Taopiaopiao (淘票票) to get the city list

import urllib.request
import json
import jsonpath

url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1708259958714_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
headers = {
    'Accept':'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Bx-V':'2.5.11',
    'Cookie':'t=05c9fa0d904918b7a013f30a9deb02f0; cookie2=12da420ef2dfbeeeca0a8d50029633f0; v=0; _tb_token_=7863eb1ee79e3; cna=XehYHv4X3EsCAQHGFQJeUH9o; xlly_s=1; tfstk=eegW3nDqaTXSKIjlhLOqGOLxeSzBgYTNd6NKsXQP9zU8OeME3_lrT4ob9vkDtQ4r9JMI3-DQxQ2UpJME90R27FloZy4LRd8w7PVYFGJNMrRv3b43JK5VJh8sZASC_MSvHEXOuFcwhZQfJFwd4V9Y-yubNoKqa-e4nBP8Val7Hwptv760BbefJwIyUNys8PjCGlbQGRRXGMjhSL_dKKOP5Yq8iSuwGI65xuF0GRRXGMj32SV4bIOfFMf..; isg=BCEhH8xc4CHVhEzVCfuM7FwuMO07zpXAYNTEEYP28ygU6kC8yRsxkgFkTB7sIi34',
    'Referer':'https://dianying.taobao.com/',
    'Sec-Ch-Ua':'"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
    'Sec-Ch-Ua-Mobile':'?0',
    'Sec-Ch-Ua-Platform':'"Windows"',
    'Sec-Fetch-Dest':'empty',
    'Sec-Fetch-Mode':'cors',
    'Sec-Fetch-Site':'same-origin',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'
}
# Build the customized request object
request = urllib.request.Request(url=url, headers=headers)
# Simulate a browser visiting the server
response = urllib.request.urlopen(request)
# Read the response body
content = response.read().decode('utf-8')
# Strip the JSONP wrapper: keep only the JSON between the outer parentheses
content = content.split('(')[1].split(')')[0]
# Save the JSON locally, then load it back as a Python object for jsonpath
# (json.loads(content) would also work directly)
with open('./解析_淘票票.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

obj = json.load(open('./解析_淘票票.json', 'r', encoding='utf-8'))
# $..regionName matches every regionName field anywhere in the document
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)

bs4

Note: if a tag contains only text, both string and get_text() return it. If the tag contains other tags besides the text, only get_text() works; string returns None.
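A minimal sketch of the difference, using inline HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div>Menu:<span>Latte</span></div>', 'lxml')

span = soup.find('span')   # contains only text
print(span.string)         # Latte
print(span.get_text())     # Latte

div = soup.find('div')     # contains text plus another tag
print(div.string)          # None -- string gives up on mixed content
print(div.get_text())      # Menu:Latte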

 

Basic usage of bs4
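A minimal sketch of the common lookup methods, using inline HTML:

from bs4 import BeautifulSoup

html = '''
<ul id="menu">
  <li class="item">Americano</li>
  <li class="item">Latte</li>
</ul>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.find('li'))                   # first matching tag
print(soup.find_all('li'))               # list of all matching tags
print(soup.select('#menu .item'))        # CSS selectors: id, class, nesting
print(soup.select_one('li').get_text())  # Americano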

bs4: scraping Starbucks menu data
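A minimal sketch, assuming the Starbucks China menu page and the product-grid selector this kind of tutorial usually targets; the page structure may have changed:

import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.starbucks.com.cn/menu/'
content = urllib.request.urlopen(url).read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')
# Assumed structure: each product name sits in a <strong> inside the product grid
name_list = soup.select('ul[class="grid padded-3 product"] strong')
for name in name_list:
    print(name.get_text())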

selenium

Selenium is a way to get around anti-scraping measures: when a scraper only simulates browser requests (for example with urllib), the server may respond without the data that a real browser would render via JavaScript. Driving a real browser with Selenium yields the fully rendered page, and therefore more of the site's data.
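A minimal sketch, assuming Chrome and a matching chromedriver are installed:

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.jd.com')

# page_source holds the JavaScript-rendered HTML that urllib alone would miss
print(driver.page_source[:500])
driver.quit()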

Selenium: the JD (京东) flash-sale page

Selenium element locating
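A minimal sketch of the common locating strategies (selenium 4 style, same browser assumptions as above):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.baidu.com')

button = driver.find_element(By.ID, 'su')           # by id
box = driver.find_element(By.NAME, 'wd')            # by name attribute
links = driver.find_elements(By.XPATH, '//a')       # by XPath, all matches
same = driver.find_element(By.CSS_SELECTOR, '#su')  # by CSS selector
driver.quit()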

Selenium: accessing element information
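A minimal sketch of reading element information (same assumptions as above):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.baidu.com')

button = driver.find_element(By.ID, 'su')
print(button.get_attribute('value'))  # an attribute's value, e.g. 百度一下
print(button.tag_name)                # the tag name, e.g. input
print(button.text)                    # the visible text between the tags
driver.quit()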

Selenium: headless browser usage
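A minimal sketch of headless Chrome (same assumptions as above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')    # run Chrome without a visible window
options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=options)
driver.get('https://www.baidu.com')
print(driver.title)
driver.quit()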

Basic usage of requests

requests: GET requests

requests: POST requests
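A minimal combined sketch for the three requests items above; the POST endpoint (Baidu Translate's suggestion API) is an assumption taken from common tutorials:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# GET: params are encoded automatically, no urlencode/encode dance needed
response = requests.get('https://www.baidu.com/s',
                        params={'wd': '北京'}, headers=headers)
response.encoding = 'utf-8'
print(response.status_code, response.url)

# POST: pass data as a plain dict; requests encodes the body itself
response = requests.post('https://fanyi.baidu.com/sug',
                         data={'kw': 'eye'}, headers=headers)
print(response.json())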

scrapy
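A hypothetical minimal spider, using the quotes.toscrape.com practice site, to show the basic shape of the Scrapy API:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # response.css / response.xpath play the role lxml and bs4 played above
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}

Run it with: scrapy runspider quotes_spider.py -o quotes.json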

 
