一、基本认识
-
1、发送一个
get
请求import requests if __name__ == "__main__": # 获取一个get请求 response = requests.get('http://httpbin.org/get')
-
2、关于获取请求到数据常见的返回值
import requests if __name__ == "__main__": # 获取一个get请求 response = requests.get('http://httpbin.org/get') # 对抓取的网站设置编码 response.encoding = 'utf-8' # 打印返回的数据 print(response.text) print(response.json()) print(response.headers) print(response.status_code) print(response.url) print(response.cookies) print(response.json()) # 获取最原始的字符串,没有编码的(用户response.text出现乱码的时候,及下载二进制文件的时候) print(response.content)
-
3、关于其他的请求方式
response = requests.post('http://httpbin.org/post') response = requests.put('http://httpbin.org/put') response = requests.delete('http://httpbin.org/delete') response = requests.head('http://httpbin.org/get') response = requests.options('http://httpbin.org/get')
二、关于get
请求传递参数的
-
1、直接在
url
地址后面拼接参数import requests if __name__ == "__main__": # 定义一个请求头(模拟浏览器) headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'} # 设置参数 data = {'name': 'june', 'password': 123456} # 获取一个get请求 response = requests.get('http://httpbin.org/get?name=june&password=123456', headers=headers) # 对抓取的网站设置编码 response.encoding = 'utf-8' print(response.text)
-
2、使用
params
传递参数import requests if __name__ == "__main__": # 定义一个请求头(模拟浏览器) headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'} # 设置参数 data = {'name': 'june', 'password': 123456} # 获取一个get请求 response = requests.get('http://httpbin.org/get', headers=headers, params=data) # 对抓取的网站设置编码 response.encoding = 'utf-8' print(response.text)
三、使用requests
库和正则表达式下载文章内容
-
1、需要下载的伯乐在线的文章标题
-
2、书写逻辑代码
import re import requests if __name__ == "__main__": headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', } url = 'http://python.jobbole.com/category/guide/' response = requests.get(url=url, headers=headers) pattern = re.compile( '<div.*?post-thumb.*?title="(.*?)".*?</a>', re.S ) print(response.status_code) result_list = re.findall(pattern, response.text) f = open('jobbole1.txt', 'a+', encoding='utf8') for item in result_list: f.write(item.strip() + '\n') f.close()
-
3、解说正则表达式
- .*?表示非贪婪的匹配任何字符
- re.S 使.匹配包括换行在内的全部字符
四、使用requests
库和正则表达式下载图片
-
1、导包
import re import os import shutil import requests
-
2、定义一个下载图片的类
class DownPic(object): def __init__(self): self.url = 'http://python.jobbole.com/category/guide/' self.headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', } self.create_dir() def create_dir(cls): # 如果文件夹存在就删除 if os.path.exists('demo'): shutil.rmtree('demo') os.makedirs('demo') def get_html(self): response = requests.get(url=self.url, headers=self.headers) return response.text def pattern(self): pattern = re.compile( '<div.*?post-thumb.*?src="(.*?)".*?</a>', re.S ) result_list = re.findall(pattern, self.get_html()) return result_list def download(self): for item in self.pattern(): # 获取到的图片地址再次请求 if item.rsplit('.')[-1] in ['png', 'jpg']: resp = requests.get(item.strip()) try: with open(os.path.join('demo', item.strip().rsplit("/")[-1]), 'wb') as f: f.write(resp.content) except Exception as e: print(e) else: continue
-
3、调用
if __name__ == "__main__": p = DownPic() p.download()
五、关于requests
库的post
请求
-
1、格式
response = requests.post('http://httpbin.org/post', headers=headers, data=data)
-
2、发送数据到服务器端
import requests if __name__ == "__main__": # 定义一个请求头(模拟浏览器) headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'} # 设置参数 data = {'email': '22@qq.com', 'password': 123456} # 获取一个get请求 response = requests.post('https://httpbin.org/post', headers=headers, data=data) # 对抓取的网站设置编码 response.encoding = 'utf-8' print(response.text)
六、使用post
请求获取拉勾网职业信息
import requests
if __name__ == "__main__":
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
}
data = {
'first': 'true',
'pn': '1',
'kd': 'python',
}
response = requests.post(url=url, headers=headers, data=data)
print(response.json())