首先要用anaconda 的pip比较好 , pip install requests
where pip 查看pip 是谁的,如果是C:\Anaconda3\Scripts\pip.exe就不用配置了
如果不是就要去环境变量配置
get 请求 确定找的url是get请求,获取的是请求的页面内容《html》
class Guba:
    """Crawl the paginated guba (stock forum) listing pages and save each page's HTML to disk."""

    def __init__(self, base_url, pages=12):
        # base_url: site root, e.g. 'http://guba.eastmoney.com'
        # pages: how many listing pages to fetch (default 12, matching the original hard-coded range)
        self.base_url = base_url
        self.pages = pages
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.gb = 'gb/'  # output directory; the page number is appended to build each filename
        self.params()

    def mkdir(self, filename, file):
        # NOTE: despite the name, this writes text content to a file, it does not
        # create a directory. Name kept for backward compatibility with callers.
        with open(filename, 'w', encoding='utf-8') as h:
            h.write(file)

    def params(self):
        """Fetch every listing page and write its HTML under self.gb."""
        if not os.path.exists(self.gb):
            os.mkdir(self.gb)
        for i in range(1, self.pages + 1):
            # Build the page URL from the constructor's base_url instead of a
            # hard-coded host, so the argument is actually honored.
            page_url = self.base_url + '/default,99_' + str(i) + '.html'
            response = requests.get(url=page_url, headers=self.headers)
            # Fail loudly on HTTP errors instead of silently saving an error page.
            response.raise_for_status()
            filename = self.gb + str(i) + '.html'
            self.mkdir(filename, response.text)
if __name__ == '__main__':
    # Listing pages follow http://guba.eastmoney.com/default,99_<n>.html (12 pages, first is index 0).
    start_url = r'http://guba.eastmoney.com'
    Guba(start_url)
2、确定基础的url
base_url = '基础的url'
3.发起请求,获取响应
response = requests.get(base_url)
4.get方法的参数
response = requests.get(
url = 请求的url,
headers = 请求头字典,
params='请求参数字典',
timeout='超时时长'
)
5、对相应内容解码,两种方法
第一种写法: response.content.decode('网页的编码方式')
乱码的第二种解决方法:
response.text #会自动识别编码方式进行解码,但有时识别错误会导致乱码
解决:response.encoding设置正确的编码格式。
requests 会自动解码来自服务器的内容。
(3)获取响应json内容。response.json()
(4) response.status_code :获取状态码
(5)response.url:获取请求的url
(6)response.headers
查看页面内容的编码格式
headers一般有如下参数
params 和 referer有些网页有有些没有 , 没有的话在request 时就不用写
分页爬取是要看url改变的规律,根据规律编写代码。就能获取到每个页码页面。
post请求
import requests
class Jinshan:
    """POST a word to the iciba translation endpoint and print its meaning."""

    def __init__(self, url, value):
        # url: the ajax translation endpoint; value: the word/text to translate.
        self.url = url
        self.value = value
        self.paqu()

    def paqu(self):
        """Send the translation POST request and print the extracted word meaning."""
        # The original also sent 'Content - Length' and 'Content - Type' — both
        # invalid header names (embedded spaces), and Content-Length was wrongly
        # computed as len(self.value). requests sets both correctly for form
        # data, so they are omitted here.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'fy.iciba.com',
            'Origin': 'http://fy.iciba.com',
            'Referer': 'http://fy.iciba.com/?trans=search',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'iciba_u_rand=fe4a0d06c0f6cfd8c53198ecbe2f6003%4027.155.93.77; iciba_u_rand_t=1569418625; UM_distinctid=16d68a3bdfd30d-0b86dbdbb7a17f-3f385804-15f900-16d68a3bdfe519; CNZZDATA1256573702=923384239-1569417092-http%253A%252F%252Fwww.iciba.com%252F%7C1569417092; __gads=ID=50204955a7618406:T=1569418689:S=ALNI_MatP-KRdyANQujH-P8AafSftbL3Tg; kdund25=y',
        }
        # f/t: source/target language ('auto' = detect); w: the query text.
        data = {
            'f': 'auto',
            't': 'auto',
            'w': self.value
        }
        response = requests.post(url=self.url, headers=headers, data=data)
        # The JSON response is a large nested dict; pull out just the meaning.
        print(response.json()['content']['word_mean'])
if __name__ == '__main__':
    # Translate a sample word through the iciba ajax endpoint.
    endpoint = 'http://fy.iciba.com/ajax.php?a=fy'
    word = 'python'
    Jinshan(endpoint, word)
当请求数据变化,请求头headers中如果有一些也跟着变化就要破解变化的规律,然后放在headers中发送请求
例子:
class Youdao:
    """Replicate the Youdao web translator's signed POST request and print the JSON result."""

    def __init__(self, base_url, value):
        # base_url: the translate_o endpoint; value: the text to translate.
        self.url = base_url
        self.value = value
        self.paqu()

    def get_md5(self, content):
        """Return the hex MD5 digest of *content* (UTF-8 encoded)."""
        md5 = hashlib.md5()
        md5.update(content.encode('utf-8'))
        return md5.hexdigest()

    def paqu(self):
        """Build the anti-scraping form fields (ts/salt/sign) and POST the query."""
        # ts: millisecond timestamp; salt: same timestamp plus a small random
        # offset — mirrors the site's JavaScript signing scheme.
        ts = str(int(time.time() * 1000))
        salt = str(int(time.time() * 1000) + random.randint(0, 10))
        data = {
            'i': self.value,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': salt,
            # sign = md5(client + query + salt + fixed secret); the secret is
            # hard-coded in the site's JS and may change over time.
            'sign': self.get_md5("fanyideskweb" + self.value + salt + "n%A-rKaT5fb[Gy?;N5@Tj"),
            'ts': ts,
            'bv': '6463522ba46bac94c96fd37965fadc8d',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_REALTlME',
        }
        # The original header names/values contained stray spaces
        # ('Accept - Encoding', 'application / json', 'http: // fanyi.youdao.com')
        # which are invalid HTTP headers / URLs; they are normalized here.
        # Content-Length (wrongly computed as len(data) — the number of dict
        # keys, not the body size) is omitted: requests sets it correctly.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'Cookie': '_ga=GA1.2.1273856326.1567209202; OUTFOX_SEARCH_USER_ID_NCOO=1648286369.8185346; OUTFOX_SEARCH_USER_ID="-2106389152@10.168.11.144"; P_INFO=18077032768|1567209247|1|youdaonote|00&99|null&null&null#gud&440100#10#0|&0||18077032768; _ntes_nnid=c93fb2be47589e0b49f27b3019597fc2,1569057921486; _gid=GA1.2.1998169510.1569294659; JSESSIONID=aaaqNPfcQ7CmEha9BCM1w; ___rl__test__cookies=1569381827679',
            'Host': 'fanyi.youdao.com',
            'Origin': 'http://fanyi.youdao.com',
            'Referer': 'http://fanyi.youdao.com/',
            'X-Requested-With': 'XMLHttpRequest',
        }
        response = requests.post(url=self.url, headers=headers, data=data)
        print(response.json())
if __name__ == '__main__':
    # Translate a sample word through the signed Youdao endpoint.
    word = 'dog'
    endpoint = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
    Youdao(endpoint, word)
#get 和post 的不同
1、返回内容不同
get返回的是html页面,post返回的是请求的数据json对象,直接是请求的数据
2、get 请求传的是params ,有些网站没有params不用传
```python
response = requests.get(url=url, params=params, headers=self.headers)
```
post 传的是data,包含post请求的关键字
```python
response = requests.post(url=self.url, headers=headers, data=data)
```
3、请求方法不同,一个是get , 一个是post,
post 请求headers需要传比较多的值,
4、找url不同,找url时要看网页的请求方式,如果和自己的请求方式不同,url就是错误
post找post
![在这里插入图片描述](https://img-blog.csdnimg.cn/20190925231222925.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3Bhbmp1bnhpYW8=,size_16,color_FFFFFF,t_70)
get 找get
![在这里插入图片描述](https://img-blog.csdnimg.cn/2019092523131649.png)
5、post很有可能是ajax请求在查看位置不同
![在这里插入图片描述](https://img-blog.csdnimg.cn/20190925231420321.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3Bhbmp1bnhpYW8=,size_16,color_FFFFFF,t_70)