Scraping data with the requests library
A URL (Uniform Resource Locator) is made up of protocol + domain + path + parameters.
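As a quick illustration (not in the original notes), the standard-library urllib.parse can split a URL into exactly these parts; the URL below is made up just to show all four:

from urllib.parse import urlparse

parts = urlparse("https://ssr1.scrape.center/page/2?limit=10")
print(parts.scheme)   # 'https'               -> protocol
print(parts.netloc)   # 'ssr1.scrape.center'  -> domain
print(parts.path)     # '/page/2'             -> path
print(parts.query)    # 'limit=10'            -> parameters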
HTTP request format: request line, request headers, blank line, request body.
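A rough sketch (not in the notes) of those four parts, rebuilt from a requests PreparedRequest; the URL is borrowed from the code further down, and headers a real client adds at send time (Host, Accept-Encoding, ...) are not shown:

import requests

req = requests.Request(
    "GET",
    "https://ssr1.scrape.center/",
    headers={"User-Agent": "Mozilla/5.0"},
).prepare()

# Rebuild the four parts: request line, headers, blank line, body
request_line = f"{req.method} {req.path_url} HTTP/1.1"
header_lines = "\r\n".join(f"{k}: {v}" for k, v in req.headers.items())
print(request_line)
print(header_lines)
print()                  # the blank line separating headers from the body
print(req.body or "")    # GET requests normally carry no body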
Common request methods:
1. GET
import requests

def get_one_img():
    # url = "https://p1.meituan.net/movie/6bea9af4524dfbd0b668eaa7e187c3df767253.jpg@464w_644h_1e_1c"
    url = "https://p1.meituan.net/movie/b607fba7513e7f15eab170aac1e1400d878112.jpg@464w_644h_1e_1c"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"
    }
    # requests.get(url, headers=...) sends the request and returns a Response object
    res = requests.get(url, headers=headers)
    print(res)
    # res.content is the raw bytes of the image, so open the file in binary mode
    with open("kill.jpg", 'wb') as f:
        f.write(res.content)

# get_one_img()

def get_img(url, name):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"}
    res = requests.get(url, headers=headers)
    # print(res)
    img_name = f"{name}.jpg"
    # open the file named by the img_name variable in binary write mode
    with open(img_name, 'wb') as f:
        f.write(res.content)

url = "https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@464w_644h_1e_1c"
name = '2'
get_img(url, name)
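For reference, a short sketch (not part of the notes) of the main attributes on the Response object that requests.get returns:

import requests

res = requests.get("https://ssr1.scrape.center/",
                   headers={"User-Agent": "Mozilla/5.0"})
res.raise_for_status()                   # raise an error on 4xx/5xx status codes
print(res.status_code)                   # numeric status code, e.g. 200
print(res.headers.get("Content-Type"))   # response headers (case-insensitive dict)
print(len(res.content))                  # res.content: raw bytes, used for images above
print(res.text[:100])                    # res.text: body decoded to str, used for HTML below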
2. POST
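The notes stop at the name; POST carries its data in the request body rather than in the URL. A minimal sketch, using httpbin.org as a stand-in test endpoint (not a site from these notes):

import requests

url = "https://httpbin.org/post"
headers = {"User-Agent": "Mozilla/5.0"}
data = {"username": "test", "password": "123456"}  # form fields, sent as the request body

res = requests.post(url, headers=headers, data=data)
print(res.status_code)   # 200 if the request succeeded
print(res.json())        # httpbin echoes back the form data it received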
Regex: extracts data by exploiting the structure of the text itself.
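A tiny illustration (not from the notes) of how the non-greedy capture group used by get_urls() below pulls image URLs out of HTML text:

import re

html = '<img src="https://example.com/a.jpg"><img src="https://example.com/b.jpg">'
# (https.*?) is a non-greedy capture group: it stops at the first closing quote,
# so each src attribute is captured separately.
print(re.findall('src="(https.*?)"', html))
# ['https://example.com/a.jpg', 'https://example.com/b.jpg']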
Next, the code that uses the requests library to scrape the images:
import re
import requests

def get_one_img():
    # url = "https://p1.meituan.net/movie/6bea9af4524dfbd0b668eaa7e187c3df767253.jpg@464w_644h_1e_1c"
    url = "https://p1.meituan.net/movie/b607fba7513e7f15eab170aac1e1400d878112.jpg@464w_644h_1e_1c"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"
    }
    # send the GET request and save the image bytes
    res = requests.get(url, headers=headers)
    # print(res)
    with open("boat.jpg", 'wb') as f:
        f.write(res.content)

get_one_img()

def get_img(url, name):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"}
    res = requests.get(url, headers=headers)
    img_name = f"{name}.jpg"
    with open(img_name, 'wb') as f:
        f.write(res.content)

# url = "https://p0.meituan.net/movie/da64660f82b98cdc1b8a3804e69609e041108.jpg@464w_644h_1e_1c"
# name = '1'
# get_img(url, name)

def get_urls():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"
    }
    # no longer collecting image URLs by hand: request the listing page and scrape them
    res = requests.get("https://ssr1.scrape.center/", headers=headers)
    text = res.text  # decode the response body as text (the HTML source)
    # non-greedy capture: grab every src attribute value that starts with https
    urls = re.findall('src="(https.*?)"', text)
    return urls

urls = get_urls()
for index, url in enumerate(urls):
    get_img(url, index)
Next: scraping text information with the requests library.
Note: the relatively important text information on a page needs to be extracted from the raw HTML.
XPath: extracts data by exploiting the structure of the web page (the HTML element tree).
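A hedged sketch of the XPath approach with lxml, assuming the movie titles on https://ssr1.scrape.center/ sit in <h2 class="m-b-sm"> elements (an assumption about the page structure, not something stated in the notes):

import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}
res = requests.get("https://ssr1.scrape.center/", headers=headers)

# Build an element tree from the HTML and query it by structure, not by text patterns
tree = etree.HTML(res.text)
# Assumed selector: movie titles appear to sit in <h2 class="m-b-sm"> on this page
titles = tree.xpath('//h2[@class="m-b-sm"]/text()')
for title in titles:
    print(title)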