urllib
https协议相对于http更安全。采用https的网站通常设有UA(User-Agent)反爬,需在请求头中携带User-Agent才能正常访问
get请求方式-quote方法(将单个参数字符串转为URL百分号编码)
get请求方式-urlencode方法(quote的进阶版,可将多个键值对一次性转为URL编码的查询字符串)
post请求方式(post请求方式中data编码之后要调用encode方法进行二次编码)
ajax的get请求方式-爬取豆瓣电影第一页
ajax的post请求方式-爬取肯德基地址前十页
import urllib.request
import urllib.parse
def create_request(page):
    """Build the POST request for one page of KFC store listings.

    page: 1-based page number, sent as the pageIndex form field.
    Returns a urllib.request.Request carrying the form data and a
    browser User-Agent header (the site rejects bare urllib requests).
    """
    base_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    # urlencode percent-encodes the Chinese city name; Request needs bytes.
    form_data = urllib.parse.urlencode({
        'cname': '郑州',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }).encode('utf-8')
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    return urllib.request.Request(url=base_url, data=form_data, headers=ua_headers)
def get_content(request):
    """Send the prepared request and return the body decoded as UTF-8.

    Fix: the original never closed the HTTP response; the context
    manager guarantees the connection is released even on error.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(page, content):
    """Write one page of store data to kfc_<page>.json (UTF-8)."""
    target = f'kfc_{page}.json'
    with open(target, 'w', encoding='utf-8') as out:
        out.write(content)
if __name__ == '__main__':
    # Crawl an inclusive page range: build request, fetch, save each page.
    first = int(input("请输入起始页码"))
    last = int(input("请输入结束页码"))
    for current in range(first, last + 1):
        down_load(current, get_content(create_request(current)))
urllib异常(urllib.error包括HTTPError,URLError)
urllib的handler处理器的基本使用
urllib代理
解析_xpath
解析_xpath解析百度网站百度一下
解析_站长资源情侣图片
import os
import urllib.request

from lxml import etree
# url='https://sc.chinaz.com/tupian/qinglvtupian.html'
# https://sc.chinaz.com/tupian/qinglvtupian_2.html
# https://sc.chinaz.com/tupian/qinglvtupian_3.html
def create_request(page):
    """Build a GET request for one chinaz.com couple-picture list page.

    Page 1 has no numeric suffix in its URL; later pages use _<page>.
    Returns a urllib.request.Request with a browser User-Agent set.
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/qinglvtupian.html'
    else:
        url = f'https://sc.chinaz.com/tupian/qinglvtupian_{page}.html'
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    return urllib.request.Request(url=url, headers=ua_headers)
def get_content(request):
    """Send the prepared request and return the body decoded as UTF-8.

    Fix: the original never closed the HTTP response; the context
    manager guarantees the connection is released even on error.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(content):
    """Parse a list-page's HTML and download every picture it shows.

    content: UTF-8 HTML source of a chinaz.com image-list page.
    Fixes: (1) create ./imgPacket first — urlretrieve raises if the
    directory is missing; (2) pair names and URLs with zip so a
    mismatch between alt and data-original counts cannot raise
    IndexError.
    """
    tree = etree.HTML(content)
    # Every picture name comes from the img tag's alt attribute.
    name_list = tree.xpath('//div[@class="container"]//img/@alt')
    # Images are lazy-loaded: the real URL is in data-original, not src.
    src_list = tree.xpath('//div[@class="container"]//img/@data-original')
    os.makedirs('./imgPacket', exist_ok=True)
    for name, src in zip(name_list, src_list):
        # data-original values are protocol-relative (//...), so prefix https:.
        url = 'https:' + src
        urllib.request.urlretrieve(url=url, filename='./imgPacket/' + name + '.jpg')
if __name__ == '__main__':
    # Crawl an inclusive page range: build request, fetch, save images.
    first = int(input('请输入起始页码'))
    last = int(input('请输入结束页码'))
    for current in range(first, last + 1):
        down_load(get_content(create_request(current)))
jsonpath
JSONPath简单入门(CSDN博客): https://blog.csdn.net/luxideyao/article/details/77802389
jsonpath解析淘票票-获取城市地址列表
import urllib.request
import json
import jsonpath
# Fetch the Taobao Tickets (淘票票) city-selection endpoint and extract
# every region name from its JSONP response with jsonpath.
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1708259958714_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
# Headers (including session cookies) copied from a real browser session;
# the Cookie values are time-limited and will likely need refreshing.
headers = {
'Accept':'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
'Accept-Language':'zh-CN,zh;q=0.9',
'Bx-V':'2.5.11',
'Cookie':'t=05c9fa0d904918b7a013f30a9deb02f0; cookie2=12da420ef2dfbeeeca0a8d50029633f0; v=0; _tb_token_=7863eb1ee79e3; cna=XehYHv4X3EsCAQHGFQJeUH9o; xlly_s=1; tfstk=eegW3nDqaTXSKIjlhLOqGOLxeSzBgYTNd6NKsXQP9zU8OeME3_lrT4ob9vkDtQ4r9JMI3-DQxQ2UpJME90R27FloZy4LRd8w7PVYFGJNMrRv3b43JK5VJh8sZASC_MSvHEXOuFcwhZQfJFwd4V9Y-yubNoKqa-e4nBP8Val7Hwptv760BbefJwIyUNys8PjCGlbQGRRXGMjhSL_dKKOP5Yq8iSuwGI65xuF0GRRXGMj32SV4bIOfFMf..; isg=BCEhH8xc4CHVhEzVCfuM7FwuMO07zpXAYNTEEYP28ygU6kC8yRsxkgFkTB7sIi34',
'Referer':'https://dianying.taobao.com/',
'Sec-Ch-Ua':'"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
'Sec-Ch-Ua-Mobile':'?0',
'Sec-Ch-Ua-Platform':'"Windows"',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
'X-Requested-With':'XMLHttpRequest'
}
# Build the request object
request = urllib.request.Request(url=url,headers=headers)
# Act like a browser: send the request to the server
response = urllib.request.urlopen(request)
# Read and decode the response body
content = response.read().decode('utf-8')
# The body is JSONP — jsonp109({...}) — so strip the wrapper by taking the
# text after the first '(' and before the first ')'.
# NOTE(review): this truncates if the JSON itself contains ')'; appears to
# hold for this payload — confirm before reusing elsewhere.
content = content.split('(')[1].split(')')[0]
# Persist the JSON to disk first (author's note: jsonpath is used against a
# locally saved file here), then reload it as a Python object.
with open('./解析_淘票票.json', 'w', encoding='utf-8')as fp:
    fp.write(content)
obj = json.load(open('./解析_淘票票.json', 'r', encoding='utf-8'))
# '$..regionName' collects every regionName value anywhere in the document.
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)
bs4
注:如果标签对象中,只有内容,那么string和get_text()都可行。反之,如果标签中除了内容还有其他标签,那么只有get_text()可行,string不可行
bs4使用方法
bs4解析_爬取星巴克菜单数据
selenium
selenium是应对反爬的手段:爬虫直接模拟请求时,对方服务器可能不返回网站的完整数据,因此需要利用selenium驱动真实浏览器访问,从而获取更多网站数据
selenium-京东秒杀
selenium元素定位
selenium访问元素信息
selenium-handless使用
requests的基本使用
requests的get请求
requests的post请求
scrapy