先安装requests请求库:pip3 install requests
导入包:
import requests import time
爬虫三部曲:
1、发送请求
def get_page(url):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to request.

    Returns:
        Whatever ``requests.get`` returns for that address.
    """
    return requests.get(url)
2、解析数据
import re

# Both patterns are compiled once and reused for every page that is parsed.
# re.S makes "." match newlines too, so a captured value may span lines.
_DETAIL_URL_RE = re.compile(
    'div class="items"><a class="imglink" href="(.*?)"', re.S
)
_MOVIE_URL_RE = re.compile('<source src="(.*?)"', re.S)


def parse_index(html):
    """Collect the detail-page links found in a list page.

    Args:
        html: Text of the list page.

    Returns:
        A list holding the ``href`` value of every ``imglink`` anchor that
        directly follows a ``div class="items">`` tag, in document order.
        The list is empty when nothing matches.
    """
    return _DETAIL_URL_RE.findall(html)


def parse_detail(html):
    """Return the first video address found in a detail page.

    Args:
        html: Text of the detail page.

    Returns:
        The ``src`` value of the first ``<source src="...">`` tag, or
        ``None`` when the page contains no such tag.
    """
    first_source = _MOVIE_URL_RE.search(html)
    if first_source is None:
        # Made explicit: callers test the result for truthiness.
        return None
    return first_source.group(1)
3、保存数据
import uuid


def save_video(content):
    """Write *content* to a new ``.mp4`` file in the current directory.

    Args:
        content: Bytes to store; the file is opened in binary write mode.
    """
    # uuid.uuid4() produces a random UUID, so every call gets its own
    # file name.
    file_name = f'{uuid.uuid4()}.mp4'
    with open(file_name, 'wb') as video_file:
        video_file.write(content)
实战案例:对校花网进行爬取
if __name__ == '__main__':
    # Walk list pages 0 through 5.
    for page_no in range(6):
        index_res = get_page(f'http://www.xiaohuar.com/list-3-{page_no}.html')

        # Request every detail page linked from this list page.
        for detail_url in parse_index(index_res.text):
            detail_res = get_page(detail_url)

            # Pull the video address out of the detail page; pages
            # without one are skipped.
            movie_url = parse_detail(detail_res.text)
            if not movie_url:
                continue

            print(movie_url)
            # Fetch the video as raw bytes and hand them to save_video,
            # which writes them to a local file.
            save_video(get_page(movie_url).content)
二、POST请求自动登录GitHub
1. 获取token字符串
'''
1.访问登录页面获取token字符串
请求URL:
https://github.com/login
请求方式:
GET
请求头:
Cookies
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
2.解析并提取token字符串
# 正则
<input type="hidden" name="authenticity_token" value="(.*?)" />
'''
import re

import requests

login_url = 'https://github.com/login'

# Request headers sent when fetching the login page.
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
login_res = requests.get(url=login_url, headers=login_headers)

# Pull the authenticity_token value out of the hidden form input.
# re.S lets "." match across line breaks.
token_matches = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    login_res.text,
    re.S
)
if not token_matches:
    # Same exception type a bare ``[0]`` on the empty list would raise,
    # but with a message that says what is actually missing.
    raise IndexError(
        'authenticity_token not found in the login page '
        f'(HTTP {login_res.status_code}); the page markup may have changed'
    )
authenticity_token = token_matches[0]
print(authenticity_token)

# Cookies returned with the login page, as a plain dict, so they can be
# sent back with the login request.
login_cookies = login_res.cookies.get_dict()
2、开始登录github
'''
POST请求自动登录GitHub:
请求url:
https://github.com/session/
请求方式:
POST
请求头:
cookie
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
请求体:
commit:Sign in
utf8:✓
authenticity_token:28M+Bm0xt10QgEidyrICyo/53xxYWV0deet0sGQKPdoQG9FXPOqFHZjMQPHHc+RBlOfJTMplbpyJI7yoBZH0zw==
login:*****
password:********
webauthn-support:unsupported
'''
import getpass
import os

# URL that the login form is posted to.
session_url = 'https://github.com/session/'

# Request headers for the login request.
session_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Credentials are supplied at run time rather than stored in the source:
# set GITHUB_LOGIN / GITHUB_PASSWORD in the environment, or type them when
# prompted (the password prompt does not echo).
github_login = os.environ.get('GITHUB_LOGIN') or input('GitHub login: ')
github_password = (
    os.environ.get('GITHUB_PASSWORD') or getpass.getpass('GitHub password: ')
)

# Form fields sent in the request body.
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": github_login,
    "password": github_password,
    "webauthn-support": "supported",
}

# Post the form together with the cookies received from the login page.
session_res = requests.post(
    url=session_url,
    headers=session_headers,
    cookies=login_cookies,
    data=form_data,
)

# Save the returned page so the result can be inspected in a browser.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(session_res.text)
用浏览器打开生成的github.html文件,可以看到无需手动输入账号密码就已经登录GitHub
刚开始尝试时用的是QQ浏览器,登录时会提示账号密码错误;后来换成谷歌浏览器,并相应修改User-Agent等数据后,就可以正常登录了