一.爬取汽车之家上资讯文章的标题及链接
import requests,bs4
# Fetch the Autohome article index page and print each article's title and link.
response = requests.get("https://www.autohome.com.cn/all/#pvareaid=3311230")
# Use the encoding detected from the response body so response.text decodes correctly.
response.encoding = response.apparent_encoding
soup = bs4.BeautifulSoup(response.text, features="html.parser")

# Every <li> under this container is treated as a candidate article entry.
target = soup.find(id="auto-channel-lazyload-article")
li_list = target.find_all("li")
for obj in li_list:
    link = obj.find("a")
    if link is None:
        # This <li> carries no link at all; nothing to print.
        continue
    title = link.find("h3")
    if title is None:
        # An <a> without an <h3> has no title; skip it instead of
        # raising AttributeError on None.
        continue
    print(title.text)
    # The href is the article link (the original misnamed it img_url).
    article_url = link.attrs.get("href")
    print(article_url)
二.自动登录抽屉新热榜并点赞
import requests
# First visit as an anonymous user to collect the initial cookies.
r1 = requests.get("https://dig.chouti.com/")
r1_cookie_dict = r1.cookies.get_dict()  # cookies as a plain dict

post_dict = {
    "phone": 12345678901,
    "password": "woshinibaba",
    "OneMonth": 1,  # keep the login state for one month
}

# Log in, sending the first-visit cookies so the server can authorize them.
r2 = requests.post(
    "https://dig.chouti.com/login",
    data=post_dict,
    cookies=r1_cookie_dict,
)

# Per the original note: after a successful login the server authorizes the
# "gpsd" cookie issued on the FIRST request (r1); the "gpsd" returned by r2 is
# of no use. This is an exception -- normally the cookies from r2 are used.
#
# `cookies=` must be a dict or CookieJar. The original passed the bare cookie
# value (a string), which requests cannot turn into a cookie jar.
gpsd = r1_cookie_dict.get("gpsd")
vote_cookies = {"gpsd": gpsd} if gpsd is not None else None

# Upvote a link; the authorized gpsd cookie proves we are logged in.
r3 = requests.post(
    "https://dig.chouti.com/link/vote?linksId=11832246",
    cookies=vote_cookies,
)
注:目前已改版,该实例无法直接使用
三.自动登录GitHub并获取自己仓库中所有项目的名称
import requests,bs4
# Step 1: load the login page and pull the hidden authenticity_token field.
i1 = requests.get('https://github.com/login')
login_page = bs4.BeautifulSoup(i1.text, features='lxml')
token_input = login_page.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = token_input.get('value')

# Keep the cookies issued on this first visit, then release the connection.
c1 = i1.cookies.get_dict()
i1.close()

# Step 2: submit the credentials together with the token and first-visit cookies.
i2 = requests.post(
    'https://github.com/session',
    data={
        "authenticity_token": authenticity_token,
        "utf8": "",
        "commit": "Sign in",
        "login": "12345678@gmail.com",
        "password": "88888888",
    },
    cookies=c1,
)

# Merge the cookies from both responses; values from the login response win.
c2 = i2.cookies.get_dict()
all_cookies = {**c1, **c2}

# Step 3: open the repository settings page with the merged cookies.
i3 = requests.get('https://github.com/settings/repositories', cookies=all_cookies)
repo_page = bs4.BeautifulSoup(i3.text, features='lxml')

# Print the href of the first link inside every matching repository row.
repo_rows = repo_page.find_all(
    name='div',
    class_='Box-row simple public fork js-collab-repo',
)
for row in repo_rows:
    anchor = row.find(name="a")
    print(anchor.attrs.get("href"))
四.自动登录知乎
- 登录次数过多后,会要求输入图片中的验证码(可以进行图像识别,可以使用插件或购买API,也可以下载图片并手动输入)
import time,requests,bs4
# Browser-like headers reused for every request in this login flow.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
}

# Visit the login page and read the _xsrf value (used to authorize the login).
# A Session is used so cookies carry over between the requests below.
session = requests.Session()
i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers=browser_headers,
)
soup1 = bs4.BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

# Download the captcha image; the current timestamp is sent as the "r" parameter.
current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={
        'r': current_time,
        'type': 'login',
    },
    headers=browser_headers,
)
with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

# Ask the user to read the captcha from the saved image, then log in.
captcha = input('请打开zhihu.gif文件,查看并输入验证码:')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    # Send what the user typed; the original sent the literal string
    # 'captcha', so the entered code was never submitted.
    "captcha": captcha,
    'email': '424662508@qq.com',
}
i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers=browser_headers,
)
五.自动登录博客园
- 登录时会将用户名和密码加密后再发到后台
import re,json,base64,rsa,requests
def js_encrypt(text):
    """Encrypt *text* with the site's RSA public key and return it as base64 text.

    Per the original note, cnblogs encrypts credentials with the JSEncrypt
    plugin; this function produces the corresponding value on the client side.

    Args:
        text: The string to encrypt; it is encoded as UTF-8 before encryption.

    Returns:
        The ciphertext as a single-line base64 string.
    """
    # Public key used for encryption: base64 of a DER-encoded key.
    public_key_b64 = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    public_key = rsa.PublicKey.load_pkcs1_openssl_der(base64.b64decode(public_key_b64))
    ciphertext = rsa.encrypt(bytes(text, 'utf8'), public_key)
    # b64encode emits no line breaks, so no newline stripping is needed.
    return base64.b64encode(ciphertext).decode('utf8')
# A Session keeps the cookies from the sign-in page for the later requests.
session = requests.Session()
i1 = session.get('https://passport.cnblogs.com/user/signin')

# Extract the VerificationToken embedded in the sign-in page. The group is
# non-greedy so it stops at the token's own closing quote rather than at the
# last quote on the line.
rep = re.compile(r"'VerificationToken':'(.*?)'")
v = rep.search(i1.text)
if v is None:
    # Fail with a clear message instead of AttributeError on None.
    raise RuntimeError("VerificationToken not found in the cnblogs sign-in page")
verification_token = v.group(1)

# The user name and password are encrypted before being sent to the backend.
form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False,
}

# The sign-in request is sent as a JSON body, with the token passed as a header.
i2 = session.post(
    url='https://passport.cnblogs.com/user/signin',
    data=json.dumps(form_data),
    headers={
        'Content-Type': 'application/json; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'VerificationToken': verification_token,
    },
)

# Fetch a page behind the login and print its HTML.
i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)