例子1.requests.session登录github
例子2 post提交数据
例子3 使用代理访问网页
例子4 一个爬取贴吧帖子标题和链接的程序
1.requests.session登录github
import requests
import re
def dologin():
    """Log in to GitHub with a requests.Session and save the profile page.

    Flow: GET the login page and scrape the CSRF token from the form,
    POST the credentials to /session, then fetch /settings/profile
    (login-only page) and write it to login.html so the result can be
    inspected by hand.

    Raises:
        RuntimeError: if the authenticity token cannot be found on the
            login page (layout changed or the request was blocked).
    """
    session = requests.session()
    # Fix: the original UA string was mangled ("Mozilla / 5.0(... likeGecko ...)")
    # and would not be recognized as a real browser.
    session.headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }

    # Step 1: scrape the CSRF token embedded in the login form.
    login_page = session.get("https://github.com/login").content.decode()
    tokens = re.findall('name="authenticity_token" value="(.*?)" />', login_page)
    if not tokens:
        # Fix: previously an unguarded [0] raised a cryptic IndexError here.
        raise RuntimeError("authenticity_token not found on login page")
    token = tokens[0]
    print(token)

    # Step 2: POST the credentials together with the CSRF token.
    data = {
        "commit": "Sign in",
        "authenticity_token": token,
        "login": "username@qq.com",  # account (placeholder)
        "password": "userpass",      # password (placeholder)
        "webauthn-support": "support",
    }
    session.post("https://github.com/session", data)

    # Step 3: fetch a login-only page and save it for manual verification.
    response = session.get("https://github.com/settings/profile")
    with open("login.html", "wb") as f:
        f.write(response.content)


if __name__ == '__main__':
    dologin()
2.post提交数据
import requests
import json
class baidu():
    """Translate a word through the iciba Ajax endpoint (POST example)."""

    def __init__(self, word):
        # NOTE(review): "?afy" looks truncated — the iciba API is usually
        # queried as "?a=fy"; confirm against a working capture.
        self.url = "http://fy.iciba.com/Ajax.php?afy"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }
        # f/t = "auto": let the service detect source and target language.
        self.data = {"f": "auto", "t": "auto", "w": word}

    def get_data(self):
        """POST the form data and return the raw response body (bytes)."""
        return requests.post(self.url, data=self.data, headers=self.header).content

    def run(self):
        """Fetch the translation and print the decoded response text."""
        payload = self.get_data()
        print(payload.decode())
if __name__ == '__main__':
    # Translate a sample word when run as a script.
    baidu("人们").run()
3.使用代理
import requests
# Example: fetch ip138.com through an HTTP proxy and save the page.
url = "https://www.ip138.com/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
}
# NOTE(review): this mapping only proxies plain-HTTP requests; the URL above
# is https, which is NOT covered — add an "https" key if proxying is required.
proxy = {
    "http": "http://58.243.28.155:4245"
}
# Fix: the header dict was previously built but never sent with the request.
# (An unused leftover "data" dict with a search keyword was also removed.)
response = requests.get(url, headers=header, proxies=proxy)
response.encoding = 'gbk'  # ip138 serves GBK-encoded pages
print(response.text)
with open("baidu.html", "wb") as f:  # file name kept from the original
    f.write(response.content)
4.一个爬取贴吧帖子标题和链接的例子
import requests
from lxml import etree
class tieba(object):
    """Crawl a Baidu Tieba forum, printing each page's thread titles and links.

    name: the forum name (becomes the ``kw`` query parameter).
    """

    def __init__(self, name):
        # Fix: the original URL was "f?kw=f&kw=" + name, which sent a
        # duplicate (and bogus) kw parameter; a single kw is what the
        # site expects.
        self.url = "https://tieba.baidu.com/f?kw=" + name
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
        }

    def getdata(self, url):
        """GET *url* with the browser-like header; return UTF-8 text."""
        return requests.get(url, headers=self.header).content.decode("utf-8")

    def pd(self, data):
        """Parse one forum page.

        Returns (url_list, nexturl): url_list is a list of
        {'title', 'link'} dicts, one per thread; nexturl is the absolute
        URL of the next page, or None on the last page.
        """
        # Tieba wraps the thread list in HTML comments; strip the markers
        # so lxml can see the real markup.
        data = data.replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        # The trailing space in the class value is significant in the markup.
        anchors = html.xpath('//*[@class="j_th_tit "]')
        print(len(anchors))
        url_list = []
        for el in anchors:
            item = {
                'title': el.xpath("./text()")[0],
                # Fix: hrefs begin with "/", so joining onto a host ending
                # in "/" produced "...com//p/..."; drop the extra slash.
                'link': 'https://tieba.baidu.com' + el.xpath("./@href")[0],
            }
            url_list.append(item)
        print(url_list)
        try:
            nexturl = 'https:' + html.xpath('//a[@class="next pagination-item "]/@href')[0]
        except IndexError:
            # Fix: was a bare except; only "no next link" (empty xpath
            # result) is an expected failure here.
            nexturl = None
        return url_list, nexturl

    def run(self):
        """Crawl pages starting at self.url until no next page remains."""
        nexturl = self.url
        while True:
            page = self.getdata(nexturl)
            url_list, nexturl = self.pd(page)
            if nexturl is None:  # fix: identity comparison for None
                break
if __name__ == '__main__':
    # Crawl the "红名" forum when executed directly.
    tieba("红名").run()